;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp], xmm1         ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4    ; store q1
%endif

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0
        pmaxub      xmm1, xmm4

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      t0, xmm5                ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1
%else
        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4    ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6    ; store p1
%endif

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1
%else
        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, q1                ; q1
%endif

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)
        mov         rdx, arg(2)             ; get flimit

        movdqa      t1, xmm6                ; save to t1

        movdqa      xmm4, xmm3              ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        psubusb     xmm1, xmm7
        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm4, XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0
        paddb       xmm4, xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddb       xmm7, xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4, t0                ; hev get abs (q1 - q0)

        movdqa      xmm3, t1                ; get abs (p1 - p0)
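
        ; xmm1 now holds the running pmaxub of the neighbouring-pixel
        ; differences, and xmm4/xmm3 hold abs(q1 - q0) / abs(p1 - p0) for the
        ; hev test. Per byte, the rest of this macro assembles roughly the
        ; scalar logic below (a C-style pseudocode sketch modelled on the
        ; reference filters in vp8/common/loopfilter_filters.c; names are
        ; illustrative):
        ;
        ;   mask = max(abs(p3-p2), abs(p2-p1), abs(p1-p0), abs(q1-q0),
        ;              abs(q2-q1), abs(q3-q2)) <= limit
        ;          && abs(p0-q0)*2 + abs(p1-q1)/2 <= flimit*2 + limit;
        ;   hev  = abs(p1-p0) > thresh || abs(q1-q0) > thresh;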

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, XMMWORD PTR [rdx] ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev
        por         xmm1, xmm5

        pxor        xmm7, xmm7
        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
%endmacro

%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx]             ; p1
        movdqa      xmm7, [rdx+48]          ; q1
        movdqa      xmm6, [rdx+16]          ; p0
        movdqa      xmm0, [rdx+32]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1

        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5, [GLOBAL(ones)]

        paddsw      xmm1, [GLOBAL(ones)]
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6, xmm2              ; p0+= p0 add
        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1, p1                ; p1
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1, [rdx]             ; p1
%endif
        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3, xmm0              ; q0-= q0 add

        paddsb      xmm1, xmm4              ; p1+= p1 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7, xmm4              ; q1-= q1 add

        pxor        xmm7, [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]
        movq        MMWORD PTR [rsi], xmm6          ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rax], xmm1    ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3    ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7  ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
%endif

%endmacro


;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 32                 ; reserve 32 bytes
%define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        mov         rdx, arg(3)             ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

        add         rsp, 32
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 96                 ; reserve 96 bytes
%define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
%define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
%define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
%define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
%define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

        add         rsp, 96
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx+32]          ; p1
        movdqa      xmm7, [rdx+80]          ; q1
        movdqa      xmm6, [rdx+48]          ; p0
        movdqa      xmm0, [rdx+64]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        movdqa      xmm3, xmm0              ; q0

        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)
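
        ; paddsb saturates at every step, so the two adds below leave xmm2
        ; holding vp8_signed_char_clamp((p1 - q1) + 3 * (q0 - p0)), the raw
        ; filter value before masking.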

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
        pxor        xmm0, xmm0

        pandn       xmm4, xmm1              ; vp8_filter&=~hev
        pxor        xmm1, xmm1

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        movdqa      xmm5, xmm2

        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)
        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1, [GLOBAL(s9)]      ; Filter 2 (lo) * 9

        pmulhw      xmm0, [GLOBAL(s9)]      ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5, xmm5              ; exfxgxhx
        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>=3;
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>=3;
        movdqa      xmm7, xmm1

        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2
        movdqa      xmm4, xmm1

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
        movdqa      xmm5, xmm0

        movdqa      xmm2, xmm5
        paddw       xmm0, [GLOBAL(s63)]     ; Filter 2 (hi) * 9 + 63

        paddw       xmm1, [GLOBAL(s63)]     ; Filter 2 (lo) * 9 + 63
        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5, q2                ; q2
        movdqa      xmm1, q1                ; q1
        movdqa      xmm4, p1                ; p1
        movdqa      xmm7, p2                ; p2

%elif %1 == 1
        movdqa      xmm5, XMMWORD PTR [rdi+rcx]     ; q2
        movdqa      xmm1, XMMWORD PTR [rdi]         ; q1
        movdqa      xmm4, XMMWORD PTR [rsi+rax*2]   ; p1
        movdqa      xmm7, XMMWORD PTR [rdi+rax*4]   ; p2
%elif %1 == 2
        movdqa      xmm5, XMMWORD PTR [rdx+96]      ; q2
        movdqa      xmm1, XMMWORD PTR [rdx+80]      ; q1
        movdqa      xmm4, XMMWORD PTR [rdx+32]      ; p1
        movdqa      xmm7, XMMWORD PTR [rdx+16]      ; p2
%endif

        pxor        xmm3, [GLOBAL(t80)]     ; *oq0 = sq^0x80
        pxor        xmm6, [GLOBAL(t80)]     ; *op0 = sp^0x80

        pxor        xmm1, [GLOBAL(t80)]
        pxor        xmm4, [GLOBAL(t80)]

        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1, [GLOBAL(t80)]     ; *oq1 = sq^0x80;
        pxor        xmm4, [GLOBAL(t80)]     ; *op1 = sp^0x80;

        pxor        xmm7, [GLOBAL(t80)]
        pxor        xmm5, [GLOBAL(t80)]

        paddsb      xmm7, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u1)

        pxor        xmm7, [GLOBAL(t80)]     ; *op2 = sp^0x80;
        pxor        xmm5, [GLOBAL(t80)]     ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi, [rsi+rcx*2]
        lea         rdi, [rdi+rcx*2]

        movq        MMWORD PTR [rsi], xmm6          ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3    ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1    ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4    ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7    ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5    ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx], xmm5     ; q2
        movdqa      XMMWORD PTR [rdi], xmm1         ; q1
        movdqa      XMMWORD PTR [rsi], xmm3         ; q0
        movdqa      XMMWORD PTR [rsi+rax], xmm6     ; p0
        movdqa      XMMWORD PTR [rsi+rax*2], xmm4   ; p1
        movdqa      XMMWORD PTR [rdi+rax*4], xmm7   ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80], xmm1      ; q1
        movdqa      XMMWORD PTR [rdx+64], xmm3      ; q0
        movdqa      XMMWORD PTR [rdx+48], xmm6      ; p0
        movdqa      XMMWORD PTR [rdx+32], xmm4      ; p1
%endif

%endmacro


;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 32                 ; reserve 32 bytes
%define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        mov         rdx, arg(3)             ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

        add         rsp, 32
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 96                 ; reserve 96 bytes
%define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
%define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
%define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
%define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
%define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
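        ; Per byte, the two macros invoked below implement roughly the
        ; scalar macroblock filter (a C-style sketch modelled on the
        ; reference vp8_mbfilter(); clamp() stands for
        ; vp8_signed_char_clamp() and u1/u2/u3 are illustrative names for
        ; the 9/18/27 taps):
        ;
        ;   f  = clamp((ps1 - qs1) + 3 * (qs0 - ps0)) & mask;
        ;   Filter1 = clamp((f & hev) + 4) >> 3;   qs0 -= Filter1;
        ;   Filter2 = clamp((f & hev) + 3) >> 3;   ps0 += Filter2;
        ;   w  = f & ~hev;
        ;   u3 = clamp((63 + w * 27) >> 7);        qs0 -= u3;  ps0 += u3;
        ;   u2 = clamp((63 + w * 18) >> 7);        qs1 -= u2;  ps1 += u2;
        ;   u1 = clamp((63 + w *  9) >> 7);        qs2 -= u1;  ps2 += u1;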
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

        add         rsp, 96
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                      ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]
%else
        mov         rsi, arg(5)                     ; v_ptr
%endif

        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi, [rdi+rax*8]
%else
        lea         rsi, [rsi - 4]
%endif

        punpcklwd   xmm3, xmm0                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx, srct
%else
        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2, xmm3                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                        ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6                      ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1
        punpckhwd   xmm6, xmm5                      ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1                      ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                      ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                      ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                      ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                      ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                      ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                     ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4                  ; save 4
        movdqa      [rdx+48], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get t0

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112], xmm7                 ; save 7

        movdqa      [rdx+96], xmm6                  ; save 6

        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2                  ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4                  ; save 4
        movdqa      [rdx+80], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get t0

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1

        movdqa      [rdx], xmm2
%endif
%endmacro

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        movdqa      xmm0, xmm1
        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3

        psubusb     xmm2, xmm1              ; p3 - p2
        por         xmm0, xmm2              ; abs(p2-p3)
%if %1
        movdqa      xmm2, [rdx]             ; p1
%else
        movdqa      xmm2, [rdx+32]          ; p1
%endif
        movdqa      xmm5, xmm2              ; p1
        pmaxub      xmm0, xmm7

        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

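        ; As in the horizontal case, xmm0 accumulates the running maximum of
        ; the absolute differences via pmaxub (the ffvp8 trick noted at the
        ; top of this file), so a single psubusb against the limit further
        ; down tests all six comparisons at once.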
        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0
        lea         rdx, srct

        por         xmm2, xmm7              ; abs(p1-p0)

        movdqa      t0, xmm2                ; save abs(p1-p0)

        pmaxub      xmm0, xmm2

%if %1
        movdqa      xmm5, [rdx+32]          ; q0
        movdqa      xmm7, [rdx+48]          ; q1
%else
        movdqa      xmm5, [rdx+64]          ; q0
        movdqa      xmm7, [rdx+80]          ; q1
%endif
        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm2, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        movdqa      t1, xmm7                ; save abs(q1-q0)

        movdqa      xmm4, XMMWORD PTR [rdx] ; limit

        pmaxub      xmm0, xmm7
        mov         rdx, arg(2)             ; flimit

        psubusb     xmm0, xmm4
        movdqa      xmm5, xmm2              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm2              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        movdqa      xmm2, XMMWORD PTR [rdx] ; flimit

        mov         rdx, arg(4)             ; get thresh

        por         xmm1, xmm6              ; abs(q0-p0)
        paddb       xmm2, xmm2              ; flimit*2 (less than 255)

        movdqa      xmm6, t0                ; get abs (p1 - p0)

        paddusb     xmm1, xmm1              ; abs(q0-p0)*2

        movdqa      xmm3, t1                ; get abs (q1 - q0)

        movdqa      xmm7, XMMWORD PTR [rdx] ; thresh

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6, xmm7              ; abs(p1 - p0) > thresh

        paddb       xmm4, xmm2              ; flimit * 2 + limit (less than 255)
        psubusb     xmm3, xmm7              ; abs(q1 - q0) > thresh

        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        por         xmm6, xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
        pcmpeqb     xmm6, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4

        pcmpeqb     xmm1, xmm0
        pxor        xmm4, xmm6
%endmacro

%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        psrldq      %1, 4

        movd        [rdi+2], %1
        psrldq      %1, 4

        movd        [rsi+2*rax+2], %1
        psrldq      %1, 4

        movd        [rdi+2*rax+2], %1

        movd        [rsi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rdi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rsi+2*rcx+2], %2
        psrldq      %2, 4

        movd        [rdi+2*rcx+2], %2
%endmacro


;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 96                 ; reserve 96 bytes
%define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
%define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx, [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi, [rsi+rdx*8]
        lea         rdi, [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

        add         rsp, 96
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 96                 ; reserve 96 bytes
%define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
%define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        lea         rdx, srct

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
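        ; In the UV case the first 8 rows come from the u plane and the
        ; second 8 from the v plane (the 0 in TRANSPOSE_16X8's first argument
        ; switches rsi to arg(5) halfway through), so one 16x8 transpose
        ; covers both chroma planes.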
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2

        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi, arg(0)             ; u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

        add         rsp, 96
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]             ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm7              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm7              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]          ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]          ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]          ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm2              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]          ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro

%macro MBV_WRITEBACK_1 0
        movq        QWORD PTR [rsi], xmm0
        movhps      MMWORD PTR [rdi], xmm0

        movq        QWORD PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5, [rdx+112]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0, xmm2

        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80

        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        QWORD PTR [rsi], xmm1
        movhps      MMWORD PTR [rdi], xmm1

        movq        QWORD PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm2              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        QWORD PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 160                ; reserve 160 bytes
%define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
%define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]
        MBV_WRITEBACK_2

        add         rsp, 160
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 160                ; reserve 160 bytes
%define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
%define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
%define srct [rsp + 32] ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        lea         rdx, srct

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi, arg(0)             ; u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_1
        mov         rsi, arg(5)             ; v_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_2

        add         rsp, 160
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step ; destination pitch?
        mov         rdx, arg(2)             ; flimit ; get flimit
        movdqa      xmm3, XMMWORD PTR [rdx]
        mov         rdx, arg(3)             ; limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        paddb       xmm3, xmm3              ; flimit*2 (less than 255)
        paddb       xmm3, xmm7              ; flimit * 2 + limit (less than 255)

        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax

        ; calculate mask
        movdqu      xmm1, [rsi+2*rax]       ; p1
        movdqu      xmm0, [rdi]             ; q1
        movdqa      xmm2, xmm1
        movdqa      xmm7, xmm0
        movdqa      xmm4, xmm0
        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm4              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqu      xmm5, [rsi+rax]         ; p0
        movdqu      xmm4, [rsi]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        pxor        xmm3, xmm3
        pcmpeqb     xmm5, xmm3

        ; start work on filters
        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7              ; p1 - q1

        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        ; do + 4 side
        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        movdqa      xmm1, xmm5              ; get a copy of filters
        psraw       xmm1, 11                ; arithmetic shift right 11
        psllw       xmm1, 8                 ; shift left 8 to put it back

        por         xmm0, xmm1              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0-= q0 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
        movdqu      [rsi], xmm3             ; write back

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4

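        ; SSE2 has no per-byte arithmetic shift, so each 16-bit lane is
        ; processed in two halves, as above: psllw 8 / psraw 3 / psrlw 8
        ; sign-extends and shifts the low byte, psraw 11 / psllw 8 handles
        ; the high byte, and por stitches the halves back together (net
        ; effect: signed byte >> 3 in every lane).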
        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11
        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result


        paddsb      xmm6, xmm0              ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
        movdqu      [rsi+rax], xmm6         ; write back

        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret


;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
        push        rbp                     ; save old base pointer value.
        mov         rbp, rsp                ; set new base pointer value.
        SHADOW_ARGS_TO_STACK 6
        SAVE_XMM
        GET_GOT     rbx                     ; save callee-saved reg
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp, 32                 ; reserve 32 bytes
%define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
%define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step ; destination pitch?

        lea         rsi, [rsi - 2]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movdqu      xmm0, [rsi]             ; (high 96 bits unused) 03 02 01 00
        movdqu      xmm1, [rdx]             ; (high 96 bits unused) 43 42 41 40
        movdqu      xmm2, [rdi]             ; 13 12 11 10
        movdqu      xmm3, [rcx]             ; 53 52 51 50
        punpckldq   xmm0, xmm1              ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2, xmm3              ; 53 52 51 50 13 12 11 10

        movdqu      xmm4, [rsi + rax*2]     ; 23 22 21 20
        movdqu      xmm5, [rdx + rax*2]     ; 63 62 61 60
        movdqu      xmm6, [rdi + rax*2]     ; 33 32 31 30
        movdqu      xmm7, [rcx + rax*2]     ; 73 72 71 70
        punpckldq   xmm4, xmm5              ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6, xmm7              ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0, xmm2              ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4, xmm6              ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1, xmm0
        punpcklwd   xmm0, xmm4              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1, xmm4              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2, xmm0
        punpckldq   xmm0, xmm1              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2, xmm1              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        movdqa      t0, xmm0                ; save to t0
        movdqa      t1, xmm2                ; save to t1

        lea         rsi, [rsi + rax*8]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movdqu      xmm4, [rsi]             ; 83 82 81 80
        movdqu      xmm1, [rdx]             ; c3 c2 c1 c0
        movdqu      xmm6, [rdi]             ; 93 92 91 90
        movdqu      xmm3, [rcx]             ; d3 d2 d1 d0
        punpckldq   xmm4, xmm1              ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6, xmm3              ; d3 d2 d1 d0 93 92 91 90

        movdqu      xmm0, [rsi + rax*2]     ; a3 a2 a1 a0
        movdqu      xmm5, [rdx + rax*2]     ; e3 e2 e1 e0
        movdqu      xmm2, [rdi + rax*2]     ; b3 b2 b1 b0
        movdqu      xmm7, [rcx + rax*2]     ; f3 f2 f1 f0
        punpckldq   xmm0, xmm5              ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm2, xmm7              ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4, xmm6              ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm0, xmm2              ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm1, xmm4
        punpcklwd   xmm4, xmm0              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm1, xmm0              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6, xmm4
        punpckldq   xmm4, xmm1              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6, xmm1              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm0, t0                ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        movdqa      xmm2, t1                ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        movdqa      xmm1, xmm0
        movdqa      xmm3, xmm2

        punpcklqdq  xmm0, xmm4              ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1, xmm4              ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm6              ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm6              ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        ; calculate mask
        movdqa      xmm6, xmm0              ; p1
        movdqa      xmm7, xmm3              ; q1
        psubusb     xmm7, xmm0              ; q1-=p1
        psubusb     xmm6, xmm3              ; p1-=q1
        por         xmm6, xmm7              ; abs(p1-q1)
        pand        xmm6, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm6, 1                 ; abs(p1-q1)/2

        movdqa      xmm5, xmm1              ; p0
        movdqa      xmm4, xmm2              ; q0
        psubusb     xmm5, xmm2              ; p0-=q0
        psubusb     xmm4, xmm1              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm6              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)             ; flimit
        movdqa      xmm7, XMMWORD PTR [rdx]
        mov         rdx, arg(3)             ; get limit
        movdqa      xmm6, XMMWORD PTR [rdx]
        paddb       xmm7, xmm7              ; flimit*2 (less than 255)
        paddb       xmm7, xmm6              ; flimit * 2 + limit (less than 255)

        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; xmm5 = mask

        ; start work on filters
        movdqa      t0, xmm0
        movdqa      t1, xmm3

        pxor        xmm0, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm3, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm0, xmm3              ; p1 - q1
        movdqa      xmm6, xmm1              ; p0

        movdqa      xmm7, xmm2              ; q0
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pxor        xmm7, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm7              ; offset q0

        psubsb      xmm7, xmm6              ; q0 - p0
        paddsb      xmm0, xmm7              ; p1 - q1 + 1 * (q0 - p0)

        paddsb      xmm0, xmm7              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0, xmm7              ; p1 - q1 + 3 * (q0 - p0)

        pand        xmm5, xmm0              ; mask filter values we don't care about


        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8

        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8

        movdqa      xmm7, xmm5              ; get a copy of filters
        psraw       xmm7, 11                ; arithmetic shift right 11

        psllw       xmm7, 8                 ; shift left 8 to put it back
        por         xmm0, xmm7              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0-= q0sz add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset q0

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
        movdqa      xmm0, xmm5              ; get a copy of filters

        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3

        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11

        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset p0
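
        ; The simple filter only rewrites p0/q0; the untouched p1/q1 columns
        ; saved in t0/t1 are reloaded so the 4x16 block can be transposed
        ; back and stored below.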

        movdqa      xmm0, t0                ; p1
        movdqa      xmm4, t1                ; q1

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm6              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm6              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5, xmm3
        punpcklbw   xmm3, xmm4              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5, xmm4              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2, xmm0
        punpcklwd   xmm0, xmm3              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3, xmm1
        punpcklwd   xmm1, xmm5              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3, xmm5              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx, [rsi + rax*4]

        movd        [rsi], xmm1             ; write the second 8-line result
        psrldq      xmm1, 4
        movd        [rdi], xmm1
        psrldq      xmm1, 4
        movd        [rsi + rax*2], xmm1
        psrldq      xmm1, 4
        movd        [rdi + rax*2], xmm1

        movd        [rdx], xmm3
        psrldq      xmm3, 4
        movd        [rcx], xmm3
        psrldq      xmm3, 4
        movd        [rdx + rax*2], xmm3
        psrldq      xmm3, 4
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi, [rsi + rax*8]
        neg         rax
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        [rsi], xmm0             ; write the first 8-line result
        psrldq      xmm0, 4
        movd        [rdi], xmm0
        psrldq      xmm0, 4
        movd        [rsi + rax*2], xmm0
        psrldq      xmm0, 4
        movd        [rdi + rax*2], xmm0

        movd        [rdx], xmm2
        psrldq      xmm2, 4
        movd        [rcx], xmm2
        psrldq      xmm2, 4
        movd        [rdx + rax*2], xmm2
        psrldq      xmm2, 4
        movd        [rcx + rax*2], xmm2

        add         rsp, 32
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        RESTORE_GOT
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret

SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f
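
; Notes on the constants above: tfe clears the lsb so the following psrlw
; by 1 cannot shift a bit between bytes when forming abs(p1-q1)/2; t80 is
; the 0x80 bias that moves unsigned pixels into signed range and back; t1s
; turns the +4 rounding constant into +3; ones implements the (f + 1) >> 1
; outer-tap rounding in B_FILTER; s9 (0x0900 per word) lets pmulhw compute
; Filter2 * 9 on bytes unpacked into the high byte of each word, since
; ((f << 8) * 0x0900) >> 16 == f * 9; and s63 is the +63 rounding term of
; the 9/18/27 taps.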