;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; MMX routines for VP8 coefficient dequantization and for the fused
; "dequantize + 4x4 inverse transform + add predictor" step.
;
; sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS, GET_GOT/
; RESTORE_GOT and SECTION_RODATA are not defined in this file; they are
; presumably supplied by the include below.

%include "vpx_ports/x86_abi_support.asm"


;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise dequantization of 16 16-bit coefficients:
;     dq[i] = sq[i] * q[i]      for i = 0..15
; pmullw keeps only the low 16 bits of every product.
;   sq - source (quantized) coefficients, read only
;   dq - destination for the dequantized coefficients
;   q  - per-coefficient multipliers, read only
global sym(vp8_dequantize_b_impl_mmx)
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;sq
    mov         rdi, arg(1) ;dq
    mov         rax, arg(2) ;q

    ; Four identical steps, each handling four 16-bit coefficients.
    movq        mm1, [rsi]
    pmullw      mm1, [rax+0]            ; mm1 = sq[0..3] * q[0..3]
    movq        [rdi], mm1

    movq        mm1, [rsi+8]
    pmullw      mm1, [rax+8]            ; mm1 = sq[4..7] * q[4..7]
    movq        [rdi+8], mm1

    movq        mm1, [rsi+16]
    pmullw      mm1, [rax+16]           ; mm1 = sq[8..11] * q[8..11]
    movq        [rdi+16], mm1

    movq        mm1, [rsi+24]
    pmullw      mm1, [rax+24]           ; mm1 = sq[12..15] * q[12..15]
    movq        [rdi+24], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
;
; Dequantizes a 4x4 block of 16-bit coefficients (input[i] * dq[i]), runs
; two butterfly passes separated by a 4x4 transpose, rounds the result
; with (x + 4) >> 3, adds it to a 4x4 block of predictor bytes and stores
; the sum, saturated to unsigned bytes, to dest.
;   input  - 16 coefficients; the 32 bytes are overwritten with zero
;   dq     - 16 dequantization multipliers
;   pred   - predictor pixels, 4 bytes read per row, rows 'pitch' apart
;   dest   - output pixels, 4 bytes written per row, rows 'stride' apart
;   pitch  - byte distance between predictor rows (sign-extended)
;   stride - byte distance between output rows (sign-extended)
global sym(vp8_dequant_idct_add_mmx)
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq

        ; Dequantize: mm0..mm3 = rows 0..3 of input * dq.
        movq        mm0,    [rax   ]
        pmullw      mm0,    [rdx]

        movq        mm1,    [rax +8]
        pmullw      mm1,    [rdx +8]

        movq        mm2,    [rax+16]
        pmullw      mm2,    [rdx+16]

        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]

        mov         rdx,    arg(3) ;dest
        mov         rsi,    arg(2) ;pred
        pxor        mm7,    mm7

        ; Clear the coefficient buffer now that it has been loaded.
        movq        [rax],   mm7
        movq        [rax+8], mm7

        movq        [rax+16],mm7
        movq        [rax+24],mm7

        ; rax is reused from here on: it no longer points at input.
        movsxd      rax,    dword ptr arg(4) ;pitch
        movsxd      rdi,    dword ptr arg(5) ;stride

        ; ---- first pass: butterfly across the four row registers ----
        psubw       mm0,            mm2             ; b1= 0-2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 =0+2 (2*ip2 + ip0 - ip2)

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]       ; signed high half; constant is negative as int16
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3             ;
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]  ; ip3 * (cos(pi/8) * sqrt(2) - 1)

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 (here: ip3 term minus ip1 term)

        movq        mm5,            mm1
        movq        mm4,            mm3

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; mm2 = a1 + d1

        paddw       mm4,            mm7             ; mm4 = b1 + mm7 (used as row 2 by the transpose below)
        psubw       mm0,            mm7             ; mm0 = b1 - mm7 (used as row 1 by the transpose below)

        psubw       mm6,            mm3             ; mm6 = a1 - d1

        ; ---- 4x4 transpose of mm2, mm0, mm4, mm6 (rows 0..3) ----
        ; The comments list 16-bit lanes from high to low.
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        movq        mm3,            mm5             ; 33 23 13 03

        ; ---- second pass: same butterfly on the transposed data ----
        psubw       mm0,            mm2             ; b1= 0-2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 =0+2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3             ;
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 (here: ip3 term minus ip1 term)

        movq        mm5,            mm1
        movq        mm4,            mm3

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        paddw       mm0,            [GLOBAL(fours)] ; b1 + 4: rounding bias for the >> 3 below

        paddw       mm2,            [GLOBAL(fours)] ; a1 + 4: rounding bias for the >> 3 below
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; mm2 = a1 + d1

        paddw       mm4,            mm7             ; mm4 = b1 + mm7
        psubw       mm0,            mm7             ; mm0 = b1 - mm7

        psubw       mm6,            mm3             ; mm6 = a1 - d1
        psraw       mm2,            3               ; arithmetic >> 3 on all four results

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose back so each register holds one output row ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        pxor        mm7,            mm7             ; zero, for byte<->word (un)packing

        ; Row 0: widen 4 predictor bytes to words, add the residual with
        ; signed saturation, pack to unsigned bytes, store 4 bytes.
        movd        mm4,            [rsi]
        punpcklbw   mm4,            mm7
        paddsw      mm0,            mm4
        packuswb    mm0,            mm7
        movd        [rdx],          mm0

        ; Row 1
        movd        mm4,            [rsi+rax]
        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1

        ; Row 2
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2

        ; Advance both pointers by one row so that base + 2*step below
        ; addresses row 3.
        add         rdx,            rdi
        add         rsi,            rax

        ; Row 3
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
;
; Same as vp8_dequant_idct_add_mmx, except that after dequantization the
; first coefficient (lowest word of row 0) is replaced by the low 16 bits
; of the Dc argument before the transform runs.
;   input  - 16 coefficients; the 32 bytes are overwritten with zero
;   dq     - 16 dequantization multipliers
;   pred   - predictor pixels, 4 bytes read per row, rows 'pitch' apart
;   dest   - output pixels, 4 bytes written per row, rows 'stride' apart
;   pitch  - byte distance between predictor rows (sign-extended)
;   stride - byte distance between output rows (sign-extended)
;   Dc     - replacement value for coefficient 0; only its low word is used
global sym(vp8_dequant_dc_idct_add_mmx)
sym(vp8_dequant_dc_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rax,    arg(0) ;input
        mov         rdx,    arg(1) ;dq

        ; Dequantize: mm0..mm3 = rows 0..3 of input * dq.
        movq        mm0,    [rax   ]
        pmullw      mm0,    [rdx]

        movq        mm1,    [rax +8]
        pmullw      mm1,    [rdx +8]

        movq        mm2,    [rax+16]
        pmullw      mm2,    [rdx+16]

        movq        mm3,    [rax+24]
        pmullw      mm3,    [rdx+24]

        mov         rdx,    arg(3) ;dest
        mov         rsi,    arg(2) ;pred
        pxor        mm7,    mm7

        ; Clear the coefficient buffer now that it has been loaded.
        movq        [rax],   mm7
        movq        [rax+8], mm7

        movq        [rax+16],mm7
        movq        [rax+24],mm7

        ; move lower word of Dc to lower word of mm0
        ; The shift right then left by 16 clears the lowest word of mm0;
        ; the zero-extended Dc word is then OR-ed into that slot.
        ; NOTE(review): movq from a general-purpose register needs a
        ; 64-bit source; confirm this assembles for 32-bit targets if the
        ; ABI header aliases rcx to a 32-bit register there.
        psrlq       mm0,    16
        movzx       rcx,    word ptr arg(6) ;Dc
        psllq       mm0,    16
        movq        mm7,    rcx
        por         mm0,    mm7

        ; rax is reused from here on: it no longer points at input.
        movsxd      rax,    dword ptr arg(4) ;pitch
        movsxd      rdi,    dword ptr arg(5) ;stride

        ; ---- first pass: butterfly across the four row registers ----
        psubw       mm0,            mm2             ; b1= 0-2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 =0+2 (2*ip2 + ip0 - ip2)

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]       ; signed high half; constant is negative as int16
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3             ;
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]  ; ip3 * (cos(pi/8) * sqrt(2) - 1)

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 (here: ip3 term minus ip1 term)

        movq        mm5,            mm1
        movq        mm4,            mm3

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; mm2 = a1 + d1

        paddw       mm4,            mm7             ; mm4 = b1 + mm7 (used as row 2 by the transpose below)
        psubw       mm0,            mm7             ; mm0 = b1 - mm7 (used as row 1 by the transpose below)

        psubw       mm6,            mm3             ; mm6 = a1 - d1

        ; ---- 4x4 transpose of mm2, mm0, mm4, mm6 (rows 0..3) ----
        ; The comments list 16-bit lanes from high to low.
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        movq        mm3,            mm5             ; 33 23 13 03

        ; ---- second pass: same butterfly on the transposed data ----
        psubw       mm0,            mm2             ; b1= 0-2
        paddw       mm2,            mm2             ; 2 * ip2

        movq        mm5,            mm1
        paddw       mm2,            mm0             ; a1 =0+2

        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)

        movq        mm7,            mm3             ;
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

        paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
        psubw       mm7,            mm5             ; c1 (here: ip3 term minus ip1 term)

        movq        mm5,            mm1
        movq        mm4,            mm3

        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)

        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)

        paddw       mm3,            mm5             ; d1
        paddw       mm0,            [GLOBAL(fours)] ; b1 + 4: rounding bias for the >> 3 below

        paddw       mm2,            [GLOBAL(fours)] ; a1 + 4: rounding bias for the >> 3 below
        movq        mm6,            mm2             ; a1

        movq        mm4,            mm0             ; b1
        paddw       mm2,            mm3             ; mm2 = a1 + d1

        paddw       mm4,            mm7             ; mm4 = b1 + mm7
        psubw       mm0,            mm7             ; mm0 = b1 - mm7

        psubw       mm6,            mm3             ; mm6 = a1 - d1
        psraw       mm2,            3               ; arithmetic >> 3 on all four results

        psraw       mm0,            3
        psraw       mm4,            3

        psraw       mm6,            3

        ; ---- transpose back so each register holds one output row ----
        movq        mm1,            mm2             ; 03 02 01 00
        movq        mm3,            mm4             ; 23 22 21 20

        punpcklwd   mm1,            mm0             ; 11 01 10 00
        punpckhwd   mm2,            mm0             ; 13 03 12 02

        punpcklwd   mm3,            mm6             ; 31 21 30 20
        punpckhwd   mm4,            mm6             ; 33 23 32 22

        movq        mm0,            mm1             ; 11 01 10 00
        movq        mm5,            mm2             ; 13 03 12 02

        punpckldq   mm0,            mm3             ; 30 20 10 00
        punpckhdq   mm1,            mm3             ; 31 21 11 01

        punpckldq   mm2,            mm4             ; 32 22 12 02
        punpckhdq   mm5,            mm4             ; 33 23 13 03

        pxor        mm7,            mm7             ; zero, for byte<->word (un)packing

        ; Row 0: widen 4 predictor bytes to words, add the residual with
        ; signed saturation, pack to unsigned bytes, store 4 bytes.
        movd        mm4,            [rsi]
        punpcklbw   mm4,            mm7
        paddsw      mm0,            mm4
        packuswb    mm0,            mm7
        movd        [rdx],          mm0

        ; Row 1
        movd        mm4,            [rsi+rax]
        punpcklbw   mm4,            mm7
        paddsw      mm1,            mm4
        packuswb    mm1,            mm7
        movd        [rdx+rdi],      mm1

        ; Row 2
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm2,            mm4
        packuswb    mm2,            mm7
        movd        [rdx+rdi*2],    mm2

        ; Advance both pointers by one row so that base + 2*step below
        ; addresses row 3.
        add         rdx,            rdi
        add         rsi,            rax

        ; Row 3
        movd        mm4,            [rsi+2*rax]
        punpcklbw   mm4,            mm7
        paddsw      mm5,            mm4
        packuswb    mm5,            mm7
        movd        [rdx+rdi*2],    mm5

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Fixed-point multipliers used with pmulhw (signed 16x16 -> high 16 bits).

; 0x8A8C = 35468 ~= sqrt(2) * sin(pi/8) * 65536. It is negative when read
; as a signed word, which is why every use adds the operand back after
; the pmulhw.
align 16
x_s1sqr2:
    times 4 dw 0x8A8C

; 0x4E7B = 20091 ~= (sqrt(2) * cos(pi/8) - 1) * 65536; the "less 1" part
; is restored by adding the operand back after the pmulhw.
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B

; Rounding bias added before the final arithmetic shift right by 3.
align 16
fours:
    times 4 dw 0x0004