;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7

SECTION .text

;-----------------------------------------------------------------------------
; void vp8_filter_block1d_h6_mmx
; (
;     unsigned char  *src_ptr,
;     unsigned short *output_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned int    pixel_step,            (never read by this routine)
;     unsigned int    output_height,
;     unsigned int    output_width,
;     short          *vp8_filter
; )
;
; Horizontal six-tap pass. Every loop iteration filters four neighbouring
; pixels of one source row:
;     out[x] = clamp_0_255((sum(k = 0..5) tap[k] * src[x + k - 2] + 64) >> 7)
; using 16-bit multiplies and saturating 16-bit adds, and stores the four
; results as 16-bit words (8 bytes) at output_ptr.
;
; Tap k is fetched from vp8_filter + 16*k; four words are used per tap.
; NOTE(review): presumably vp8_filter points at one 96-byte entry of a table
; laid out like vp8_six_tap_x86 below - confirm against the caller.
;
; Register roles:
;     rsi = current source row        rdi = current output position
;     rcx = rows left                 rax = output_width (added to rdi per row)
;     rdx = vp8_filter                mm0 = all zero
;     mm1 / mm2 / mm6 / mm7 = taps 1 / 2 / 3 / 4
;     taps 0 and 5 are used straight from memory
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(1)                 ; output_ptr
    movsxd      rcx, dword ptr arg(4)       ; output_height = row counter
    movsxd      rax, dword ptr arg(5)       ; output_width  = per-row step of rdi
    mov         rdx, arg(6)                 ; vp8_filter

    movq        mm1, [rdx + 16]             ; tap 1
    movq        mm2, [rdx + 32]             ; tap 2
    movq        mm6, [rdx + 48]             ; tap 3
    movq        mm7, [rdx + 64]             ; tap 4
    pxor        mm0, mm0                    ; zero, used for byte->word unpacking

.h6_next_row:
    ; tap 1 * p[-1..2]
    movq        mm3, [rsi - 2]              ; bytes p[-2..5]
    movq        mm4, mm3                    ; keep an unshifted copy
    psrlq       mm3, 8                      ; drop p[-2]
    punpcklbw   mm3, mm0                    ; words p[-1..2]
    pmullw      mm3, mm1                    ; accumulator = tap 1 product

    ; tap 4 * p[2..5]
    movq        mm5, mm4                    ; second copy of p[-2..5]
    punpckhbw   mm4, mm0                    ; words p[2..5]
    pmullw      mm4, mm7
    paddsw      mm3, mm4

    ; tap 2 * p[0..3]
    movq        mm4, mm5                    ; refresh the unshifted copy
    psrlq       mm5, 16                     ; drop p[-2], p[-1]
    punpcklbw   mm5, mm0                    ; words p[0..3]
    pmullw      mm5, mm2
    paddsw      mm3, mm5

    ; tap 3 * p[1..4]
    movq        mm5, mm4                    ; unshifted copy survives in mm5
    psrlq       mm4, 24                     ; drop p[-2..0]
    punpcklbw   mm4, mm0                    ; words p[1..4]
    pmullw      mm4, mm6
    paddsw      mm3, mm4

    ; tap 5 * p[3..6] - needs one byte beyond the first 8-byte load
    movd        mm4, [rsi + 3]
    punpcklbw   mm4, mm0                    ; words p[3..6]
    pmullw      mm4, [rdx + 80]
    paddsw      mm3, mm4

    ; tap 0 * p[-2..1]
    punpcklbw   mm5, mm0                    ; words p[-2..1]
    pmullw      mm5, [rdx]
    paddsw      mm3, mm5

    ; round, scale, clamp to 0..255, widen back to words
    paddsw      mm3, [GLOBAL(rd)]           ; + 64
    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7
    packuswb    mm3, mm0                    ; saturate to unsigned bytes
    punpcklbw   mm3, mm0                    ; back to four words

    movq        [rdi], mm3                  ; write four 16-bit results

    add         rdi, rax                    ; step output by output_width
%if ABI_IS_32BIT
    add         rsi, dword ptr arg(2)       ; src_pixels_per_line: next source row
%else
    movsxd      r8, dword ptr arg(2)        ; src_pixels_per_line
    add         rsi, r8                     ; next source row
%endif

    dec         rcx
    jnz         .h6_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; void vp8_filter_block1dc_v6_mmx
; (
;     short         *src_ptr,
;     unsigned char *output_ptr,
;     int            output_pitch,
;     unsigned int   pixels_per_line,
;     unsigned int   pixel_step,             (never read by this routine)
;     unsigned int   output_height,
;     unsigned int   output_width,           (never read by this routine)
;     short         *vp8_filter
; )
;
; Vertical six-tap pass over 16-bit samples. For each output row, four words
; are read from six consecutive source rows (two above the current row up to
; three below), weighted by taps 0..5, rounded (+64), shifted right by 7 and
; saturated to four unsigned bytes stored at output_ptr.
;
; Source rows are addressed as src_ptr + n * pixels_per_line, where
; pixels_per_line is applied directly as a byte offset.
;
; Register roles:
;     rsi = row "-2" of the current output row   rdi = output position
;     rdx = source row stride                    rax = output_pitch
;     rcx = rows left                            rbx = vp8_filter
;     mm0 = zero, mm5 = rounding constant
;     mm1 / mm2 / mm6 / mm7 = taps 1 / 2 / 3 / 4
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Fetch the rounding constant before rbx is repurposed as filter pointer.
    movq        mm5, [GLOBAL(rd)]
    push        rbx                         ; restored just before the epilog
    mov         rbx, arg(7)                 ; vp8_filter

    movq        mm1, [rbx + 16]             ; tap 1
    movq        mm2, [rbx + 32]             ; tap 2
    movq        mm6, [rbx + 48]             ; tap 3
    movq        mm7, [rbx + 64]             ; tap 4

    movsxd      rdx, dword ptr arg(3)       ; pixels_per_line = row stride
    mov         rsi, arg(0)                 ; src_ptr
    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi = two rows above src_ptr
    mov         rdi, arg(1)                 ; output_ptr
    movsxd      rax, DWORD PTR arg(2)       ; output_pitch
    movsxd      rcx, DWORD PTR arg(5)       ; output_height = row counter
    pxor        mm0, mm0                    ; zero, upper half for packuswb

.v6_next_row:
    movq        mm3, [rsi + rdx]            ; row -1
    pmullw      mm3, mm1                    ; accumulator = tap 1 product

    movq        mm4, [rsi + 4*rdx]          ; row +2
    pmullw      mm4, mm7                    ; tap 4
    paddsw      mm3, mm4

    movq        mm4, [rsi + 2*rdx]          ; row 0
    pmullw      mm4, mm2                    ; tap 2
    paddsw      mm3, mm4

    movq        mm4, [rsi]                  ; row -2
    pmullw      mm4, [rbx]                  ; tap 0
    paddsw      mm3, mm4

    ; Step down one row now: this is the per-iteration advance, and it lets
    ; rows +1 and +3 be reached with 2x / 4x scaling instead of 3x / 5x.
    add         rsi, rdx
    movq        mm4, [rsi + 2*rdx]          ; row +1
    pmullw      mm4, mm6                    ; tap 3
    paddsw      mm3, mm4

    movq        mm4, [rsi + 4*rdx]          ; row +3
    pmullw      mm4, [rbx + 80]             ; tap 5
    paddsw      mm3, mm4

    paddsw      mm3, mm5                    ; + 64
    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7
    packuswb    mm3, mm0                    ; saturate to unsigned bytes

    movd        [rdi], mm3                  ; write four output bytes
    ; Five of the six rows read above are read again by the next iteration.
    lea         rdi, [rdi + rax]            ; next output row
    dec         rcx
    jnz         .v6_next_row

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; void bilinear_predict8x4_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-pass bilinear prediction producing 8 bytes per row for 4 rows.
;     HFilter = vp8_bilinear_filters_x86_8 + 32 * xoffset
;     VFilter = vp8_bilinear_filters_x86_8 + 32 * yoffset
; Each filter supplies two taps, 16 bytes apart.
;
; Horizontal: h[x]   = (src[x] * H0 + src[x + 1] * H1 + 64) >> 7
; Vertical:   dst[x] = (h_prev[x] * V0 + h_cur[x] * V1 + 64) >> 7
; Five source rows are read in total: one before the loop, four inside it.
;
; Register roles:
;     rsi = source row            rdx = src_pixels_per_line
;     rdi = destination row       rcx = dst_ptr + 4 * dst_pitch (loop end)
;     rax = VFilter               mm0 = zero
;     mm1 / mm2 = H0 / H1         mm7 = previous horizontal row, packed bytes
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        mm0, mm0                    ; zero for unpacking
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(4)                 ; dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

    movsxd      rax, dword ptr arg(2)       ; xoffset
    shl         rax, 5                      ; 32 bytes per filter entry
    add         rax, rcx                    ; rax = HFilter
    movq        mm1, [rax]                  ; H0
    movq        mm2, [rax + 16]             ; H1

    movsxd      rax, dword ptr arg(3)       ; yoffset
    shl         rax, 5
    add         rax, rcx                    ; rax = VFilter

    movsxd      rdx, dword ptr arg(5)       ; dst_pitch
    lea         rcx, [rdi + rdx*4]          ; destination address after 4 rows

    movsxd      rdx, dword ptr arg(1)       ; src_pixels_per_line

    ; Horizontal pass on the first source row; result is kept in mm7.
    movq        mm3, [rsi]                  ; src[0..7]
    movq        mm4, mm3

    punpcklbw   mm3, mm0                    ; words src[0..3]
    punpckhbw   mm4, mm0                    ; words src[4..7]

    pmullw      mm3, mm1
    pmullw      mm4, mm1

    movq        mm5, [rsi + 1]              ; src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0                    ; words src[1..4]
    punpckhbw   mm6, mm0                    ; words src[5..8]

    pmullw      mm5, mm2
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; + 64
    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; mm7 = first horizontal row (bytes)

    add         rsi, rdx                    ; next source row

.bil8x4_next_row:
    ; Horizontal pass on the current source row -> mm3 (low), mm4 (high).
    movq        mm3, [rsi]                  ; src[0..7]
    movq        mm4, mm3

    punpcklbw   mm3, mm0
    punpckhbw   mm4, mm0

    pmullw      mm3, mm1
    pmullw      mm4, mm1

    movq        mm5, [rsi + 1]              ; src[1..8]
    movq        mm6, mm5

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, mm2
    pmullw      mm6, mm2

    paddw       mm3, mm5
    paddw       mm4, mm6

    ; Previous horizontal row times V0 -> mm5 (low), mm6 (high).
    movq        mm5, mm7
    movq        mm6, mm7

    punpcklbw   mm5, mm0
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]
    pmullw      mm6, [rax]

    ; Finish the horizontal result for the current row.
    paddw       mm3, [GLOBAL(rd)]           ; + 64
    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    movq        mm7, mm3
    packuswb    mm7, mm4                    ; remember it for the next iteration

    ; Current horizontal row times V1, plus the V0 part.
    pmullw      mm3, [rax + 16]
    pmullw      mm4, [rax + 16]

    paddw       mm3, mm5
    paddw       mm4, mm6

    paddw       mm3, [GLOBAL(rd)]           ; + 64
    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7

    paddw       mm4, [GLOBAL(rd)]
    psraw       mm4, VP8_FILTER_SHIFT

    packuswb    mm3, mm4

    movq        [rdi], mm3                  ; write eight output bytes

    add         rsi, rdx                    ; next source row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(5)       ; dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; dst_pitch
    add         rdi, r8
%endif
    cmp         rdi, rcx                    ; stop once four rows are written
    jne         .bil8x4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; void bilinear_predict4x4_mmx
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Same two-pass bilinear scheme as the 8x4 routine above, but only four
; pixels wide: each row is loaded and stored with 4-byte movd accesses.
;     HFilter = vp8_bilinear_filters_x86_8 + 32 * xoffset
;     VFilter = vp8_bilinear_filters_x86_8 + 32 * yoffset
;
; Register roles:
;     rsi = source row            rdx = src_pixels_per_line
;     rdi = destination row       rcx = dst_ptr + 4 * dst_pitch (loop end)
;     rax = VFilter               mm0 = zero
;     mm1 / mm2 = H0 / H1         mm7 = previous horizontal row, packed bytes
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        mm0, mm0                    ; zero for unpacking
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(4)                 ; dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

    movsxd      rax, dword ptr arg(2)       ; xoffset
    shl         rax, 5                      ; 32 bytes per filter entry
    add         rax, rcx                    ; rax = HFilter
    movq        mm1, [rax]                  ; H0
    movq        mm2, [rax + 16]             ; H1

    movsxd      rax, dword ptr arg(3)       ; yoffset
    shl         rax, 5
    add         rax, rcx                    ; rax = VFilter

    movsxd      rdx, dword ptr arg(5)       ; dst_pitch
    lea         rcx, [rdi + rdx*4]          ; destination address after 4 rows

    movsxd      rdx, dword ptr arg(1)       ; src_pixels_per_line

    ; Horizontal pass on the first source row; result is kept in mm7.
    movd        mm3, [rsi]                  ; src[0..3]
    punpcklbw   mm3, mm0

    pmullw      mm3, mm1
    movd        mm5, [rsi + 1]              ; src[1..4]

    punpcklbw   mm5, mm0
    pmullw      mm5, mm2

    paddw       mm3, mm5
    paddw       mm3, [GLOBAL(rd)]           ; + 64

    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7

    movq        mm7, mm3
    packuswb    mm7, mm0                    ; mm7 = first horizontal row (bytes)

    add         rsi, rdx                    ; next source row

.bil4x4_next_row:
    ; Horizontal pass on the current source row -> mm3.
    movd        mm3, [rsi]                  ; src[0..3]
    punpcklbw   mm3, mm0

    pmullw      mm3, mm1
    movd        mm5, [rsi + 1]              ; src[1..4]

    punpcklbw   mm5, mm0
    pmullw      mm5, mm2

    paddw       mm3, mm5

    ; Previous horizontal row times V0 -> mm5.
    movq        mm5, mm7
    punpcklbw   mm5, mm0

    pmullw      mm5, [rax]
    paddw       mm3, [GLOBAL(rd)]           ; + 64 (horizontal result)

    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7
    movq        mm7, mm3

    packuswb    mm7, mm0                    ; remember it for the next iteration

    ; Current horizontal row times V1, plus the V0 part.
    pmullw      mm3, [rax + 16]
    paddw       mm3, mm5

    paddw       mm3, [GLOBAL(rd)]           ; + 64
    psraw       mm3, VP8_FILTER_SHIFT       ; >> 7

    packuswb    mm3, mm0
    movd        [rdi], mm3                  ; write four output bytes

    add         rsi, rdx                    ; next source row
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(5)       ; dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ; dst_pitch
    add         rdi, r8
%endif

    cmp         rdi, rcx                    ; stop once four rows are written
    jne         .bil4x4_next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
; Rounding constant added before the >> VP8_FILTER_SHIFT: four words of 64.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: 8 filters x 6 taps. Every tap value is replicated
; into eight 16-bit words, so one tap occupies 16 bytes and one filter
; occupies 96 bytes.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    ; filter 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0