;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SSE2 two-tap (bilinear) filters for 16-bit pixels.
;
; Syntax: NASM/Yasm, Intel operand order.  arg(), sym(), PRIVATE,
; SHADOW_ARGS_TO_STACK / UNSHADOW_ARGS and SAVE_XMM / RESTORE_XMM are not
; defined in this file; they are expected to come from x86_abi_support.asm.
;
; Every exported routine reads the same seven arguments:
;   arg(0) src_ptr          first source pixel
;   arg(1) pixels_per_line  source stride, counted in 16-bit pixels
;   arg(2) output_ptr       first destination pixel
;   arg(3) out_pitch        destination stride, counted in 16-bit pixels
;   arg(4) output_height    number of rows to produce; must be >= 1 because
;                           every row loop tests its counter only after a row
;                           has been written
;   arg(5) filter           table of eight 16-bit taps, loaded with movdqa
;                           (so it must be 16-byte aligned); only taps 3 and 4
;                           (k3, k4) are used
;   arg(6) bps              bit depth; results are clamped to
;                           [0, (1 << bps) - 1]
; arg(1), arg(3), arg(4) and arg(6) are read as DWORDs and sign-extended.
;
; For every output pixel:
;       out = clamp((a * k3 + b * k4 + 64) >> 7)
; where b is the pixel one row below a (_v2 routines) or the pixel immediately
; to the right of a (_h2 routines).  The _avg routines store pavgw(out, dst)
; instead of out, i.e. the rounded average with the pixel already present in
; the destination.
;
; block1d4 / block1d8 / block1d16 produce 4 / 8 / 16 pixels per row.
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; HIGH_GET_PARAM_4
; Loads the arguments and builds the constants used by HIGH_APPLY_FILTER_4.
; On exit:
;   rsi  = src_ptr              rdi  = output_ptr
;   rax  = pixels_per_line      rdx  = out_pitch      rcx = output_height
;   xmm4 = k3,k4 word pairs     xmm3 = 64 in every dword (rounding term)
;   xmm5 = (1 << bps) - 1 in every word (clamp maximum)
;   xmm2 = 0 (clamp minimum)
; xmm1 is used as scratch.
;-----------------------------------------------------------------------------
%macro HIGH_GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;64 = half of 1 << 7

    movdqa      xmm3, [rdx]                 ;load filters
    pshuflw     xmm4, xmm3, 11111111b       ;k3 in the low four words
    psrldq      xmm3, 8                     ;bring taps 4..7 down
    pshuflw     xmm3, xmm3, 0b              ;k4 in the low four words
    punpcklwd   xmm4, xmm3                  ;k3k4 pairs

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;... in every dword

    mov         rdx, 0x00010001             ;a 1 in each 16-bit half
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm5, rdx
    movq        xmm2, rcx                   ;shift count for psllw
    pshufd      xmm5, xmm5, 0b              ;1 in every word
    movdqa      xmm1, xmm5
    psllw       xmm5, xmm2                  ;1 << bps
    psubw       xmm5, xmm1                  ;max value (for clamping)
    pxor        xmm2, xmm2                  ;min value (for clamping)

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters and stores one row of four pixels.
;   %1   = 0: overwrite the destination; non-zero: average with it (pavgw)
; In:
;   xmm0 = first-tap pixels in the low four words
;   xmm1 = second-tap pixels in the low four words
;   registers as left by HIGH_GET_PARAM_4
; Out:
;   8 bytes written at [rdi]; rsi and rdi advanced by one row; rcx
;   decremented.  dec is the last instruction, so ZF describes rcx for the
;   caller's jnz.
; Only the low halves of xmm0/xmm1 are consumed.  Overwrites xmm0, and xmm1
; when averaging.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1

    punpcklwd   xmm0, xmm1                  ;two row in one register
    pmaddwd     xmm0, xmm4                  ;a*k3 + b*k4 per dword

    paddd       xmm0, xmm3                  ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word

    ;clamp the values
    pminsw      xmm0, xmm5
    pmaxsw      xmm0, xmm2

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif

    movq        [rdi], xmm0
    lea         rsi, [rsi + 2*rax]          ;strides are in 2-byte pixels
    lea         rdi, [rdi + 2*rdx]
    dec         rcx
%endm

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; HIGH_GET_PARAM
; Loads the arguments and builds the constants used by HIGH_APPLY_FILTER_8
; and HIGH_APPLY_FILTER_16.
; On exit:
;   rsi  = src_ptr              rdi  = output_ptr
;   rax  = pixels_per_line      rdx  = out_pitch      rcx = output_height
;   xmm7 = k3,k4 word pairs     xmm4 = 64 in every dword (rounding term)
;   xmm8 = (1 << bps) - 1 in every word (clamp maximum)
;   xmm5 = 0 (clamp minimum)
; xmm1 and xmm6 are used as scratch.
;-----------------------------------------------------------------------------
%macro HIGH_GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;64 = half of 1 << 7

    movdqa      xmm6, [rdx]                 ;load filters

    pshuflw     xmm7, xmm6, 11111111b       ;k3 in the low four words
    pshufhw     xmm6, xmm6, 0b              ;k4 in the high four words
    psrldq      xmm6, 8                     ;... moved to the low four words
    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;... in every dword

    mov         rdx, 0x00010001             ;a 1 in each 16-bit half
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm8, rdx
    movq        xmm5, rcx                   ;shift count for psllw
    pshufd      xmm8, xmm8, 0b              ;1 in every word
    movdqa      xmm1, xmm8
    psllw       xmm8, xmm5                  ;1 << bps
    psubw       xmm8, xmm1                  ;max value (for clamping)
    pxor        xmm5, xmm5                  ;min value (for clamping)

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg
; Filters and stores one row of eight pixels.
;   %1   = 0: overwrite the destination; non-zero: average with it (pavgw)
; In:
;   xmm0 = eight first-tap pixels, xmm1 = eight second-tap pixels
;   registers as left by HIGH_GET_PARAM
; Out:
;   16 bytes written at [rdi] (unaligned store); rsi and rdi advanced by one
;   row; rcx decremented, ZF left for the caller's jnz.
; Overwrites xmm0 and xmm6, and xmm1 when averaging.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 1
    movdqa      xmm6, xmm0
    punpckhwd   xmm6, xmm1                  ;pixel pairs 4..7
    punpcklwd   xmm0, xmm1                  ;pixel pairs 0..3
    pmaddwd     xmm6, xmm7
    pmaddwd     xmm0, xmm7

    paddd       xmm6, xmm4                  ;rounding
    paddd       xmm0, xmm4                  ;rounding
    psrad       xmm6, 7                     ;shift
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm6                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5

%if %1
    movdqu      xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + 2*rax]          ;strides are in 2-byte pixels
    lea         rdi, [rdi + 2*rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_16 avg
; Filters and stores one row of sixteen pixels.
;   %1   = 0: overwrite the destination; non-zero: average with it (pavgw)
; In:
;   xmm0 / xmm2 = first-tap pixels 0..7 / 8..15
;   xmm1 / xmm3 = second-tap pixels 0..7 / 8..15
;   registers as left by HIGH_GET_PARAM
; Out:
;   32 bytes written at [rdi] (unaligned stores); rsi and rdi advanced by one
;   row; rcx decremented, ZF left for the caller's jnz.
; Overwrites xmm0, xmm2, xmm6 and xmm9, and xmm1/xmm3 when averaging.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_16 1
    movdqa      xmm9, xmm0
    movdqa      xmm6, xmm2
    punpckhwd   xmm9, xmm1                  ;pixel pairs 4..7
    punpckhwd   xmm6, xmm3                  ;pixel pairs 12..15
    punpcklwd   xmm0, xmm1                  ;pixel pairs 0..3
    punpcklwd   xmm2, xmm3                  ;pixel pairs 8..11

    pmaddwd     xmm9, xmm7
    pmaddwd     xmm6, xmm7
    pmaddwd     xmm0, xmm7
    pmaddwd     xmm2, xmm7

    paddd       xmm9, xmm4                  ;rounding
    paddd       xmm6, xmm4
    paddd       xmm0, xmm4
    paddd       xmm2, xmm4

    psrad       xmm9, 7                     ;shift
    psrad       xmm6, 7
    psrad       xmm0, 7
    psrad       xmm2, 7

    packssdw    xmm0, xmm9                  ;pack back to word
    packssdw    xmm2, xmm6                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, xmm8
    pmaxsw      xmm0, xmm5
    pminsw      xmm2, xmm8
    pmaxsw      xmm2, xmm5

%if %1
    movdqu      xmm1, [rdi]
    movdqu      xmm3, [rdi + 16]
    pavgw       xmm0, xmm1
    pavgw       xmm2, xmm3
%endif
    movdqu      [rdi], xmm0                 ;store the result
    movdqu      [rdi + 16], xmm2            ;store the result

    lea         rsi, [rsi + 2*rax]          ;strides are in 2-byte pixels
    lea         rdi, [rdi + 2*rdx]
    dec         rcx
%endm
%endif

SECTION .text

;-----------------------------------------------------------------------------
; Vertical filter, 4 pixels per row, overwrite destination.
; Each output row combines source row r with source row r + 1.
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm5.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ;row r, pixels 0..3
    movq        xmm1, [rsi + 2*rax]         ;row r + 1, pixels 0..3

    HIGH_APPLY_FILTER_4 0
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; Vertical filter, 8 pixels per row, overwrite destination.
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm8.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;row r
    movdqu      xmm1, [rsi + 2*rax]         ;row r + 1

    HIGH_APPLY_FILTER_8 0
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Vertical filter, 16 pixels per row, overwrite destination.
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm9.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;row r, pixels 0..7
    movdqu      xmm2, [rsi + 16]            ;row r, pixels 8..15
    movdqu      xmm1, [rsi + 2*rax]         ;row r + 1, pixels 0..7
    movdqu      xmm3, [rsi + 2*rax + 16]    ;row r + 1, pixels 8..15

    HIGH_APPLY_FILTER_16 0
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endif

;-----------------------------------------------------------------------------
; Vertical filter, 4 pixels per row, averaged with the destination.
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm5.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    movq        xmm0, [rsi]                 ;row r, pixels 0..3
    movq        xmm1, [rsi + 2*rax]         ;row r + 1, pixels 0..3

    HIGH_APPLY_FILTER_4 1
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; Vertical filter, 8 pixels per row, averaged with the destination.
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm8.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;row r
    movdqu      xmm1, [rsi + 2*rax]         ;row r + 1

    HIGH_APPLY_FILTER_8 1
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Vertical filter, 16 pixels per row, averaged with the destination.
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm9.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;row r, pixels 0..7
    movdqu      xmm1, [rsi + 2*rax]         ;row r + 1, pixels 0..7
    movdqu      xmm2, [rsi + 16]            ;row r, pixels 8..15
    movdqu      xmm3, [rsi + 2*rax + 16]    ;row r + 1, pixels 8..15

    HIGH_APPLY_FILTER_16 1
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endif

;-----------------------------------------------------------------------------
; Horizontal filter, 4 pixels per row, overwrite destination.
; Each output pixel combines source pixel x with source pixel x + 1, so a row
; consumes five source pixels (10 bytes).
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm5.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    ; Two 8-byte loads touch exactly the 10 source bytes that are used.  The
    ; filter macro only consumes the low four words of xmm0 and xmm1, so a
    ; 16-byte load would read 6 bytes beyond the last pixel needed.
    movq        xmm0, [rsi]                 ;pixels 0..3
    movq        xmm1, [rsi + 2]             ;pixels 1..4

    HIGH_APPLY_FILTER_4 0
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; Horizontal filter, 8 pixels per row, overwrite destination.
; A row consumes nine source pixels (18 bytes).
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm8.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;pixels 0..7
    movdqu      xmm1, [rsi + 2]             ;pixels 1..8

    HIGH_APPLY_FILTER_8 0
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Horizontal filter, 16 pixels per row, overwrite destination.
; A row consumes seventeen source pixels (34 bytes).
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm9.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;pixels 0..7
    movdqu      xmm1, [rsi + 2]             ;pixels 1..8
    movdqu      xmm2, [rsi + 16]            ;pixels 8..15
    movdqu      xmm3, [rsi + 18]            ;pixels 9..16

    HIGH_APPLY_FILTER_16 0
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endif

;-----------------------------------------------------------------------------
; Horizontal filter, 4 pixels per row, averaged with the destination.
; A row consumes five source pixels (10 bytes).
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm5.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM_4
.loop:
    ; Two 8-byte loads touch exactly the 10 source bytes that are used.  The
    ; filter macro only consumes the low four words of xmm0 and xmm1, so a
    ; 16-byte load would read 6 bytes beyond the last pixel needed.
    movq        xmm0, [rsi]                 ;pixels 0..3
    movq        xmm1, [rsi + 2]             ;pixels 1..4

    HIGH_APPLY_FILTER_4 1
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; Horizontal filter, 8 pixels per row, averaged with the destination.
; A row consumes nine source pixels (18 bytes).
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm8.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 8
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;pixels 0..7
    movdqu      xmm1, [rsi + 2]             ;pixels 1..8

    HIGH_APPLY_FILTER_8 1
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Horizontal filter, 16 pixels per row, averaged with the destination.
; A row consumes seventeen source pixels (34 bytes).
; Uses rax, rcx, rdx, rsi, rdi, xmm0-xmm9.
;-----------------------------------------------------------------------------
global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 9
    push        rsi
    push        rdi
    ; end prolog

    HIGH_GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;pixels 0..7
    movdqu      xmm1, [rsi + 2]             ;pixels 1..8
    movdqu      xmm2, [rsi + 16]            ;pixels 8..15
    movdqu      xmm3, [rsi + 18]            ;pixels 9..16

    HIGH_APPLY_FILTER_16 1
    jnz         .loop                       ;until output_height rows are done

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endif