;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Binds the symbolic names `input`, `output` and `pitch` to registers for
; the ABI being assembled and emits the prologue that ABI needs.
;
;   32-bit : frame pointer in rbp, GET_GOT rbx, rsi/rdi saved on the
;            stack; the three arguments are loaded from arg(0..2).
;            input = rsi, output = rdi, pitch = rax (sign-extended).
;   Win64  : input = rcx, output = rdx, pitch = r8; SAVE_XMM 7, u is
;            invoked (macro from the include above; presumably it spills
;            xmm6-xmm7 -- confirm against x86_abi_support.asm).
;   other  : input = rdi, output = rsi, pitch = rdx.
;
; `pitch` is an `int` in the C prototypes below.  On the 64-bit paths it
; is sign-extended from the 32-bit argument register before it is used in
; 64-bit address arithmetic, mirroring what the 32-bit path already does.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u
    movsxd      pitch, r8d                      ; int pitch -> 64-bit index
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      pitch, edx                      ; int pitch -> 64-bit index
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Blanks the input/output/pitch aliases, undoes whatever
; STACK_FRAME_CREATE pushed/saved for the current ABI, and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

SECTION .text

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 forward transform on 16-bit samples.
;
; In:    input  - four rows of four shorts; consecutive rows are `pitch`
;                 BYTES apart (pitch is added directly to the pointer).
;                 Rows are read with movq.
;        output - 16 shorts, written with two movdqa stores, so it must
;                 be 16-byte aligned.
; Clobb: xmm0-xmm5; the `input` register is advanced by 2*pitch.
;
; Lane comments in this routine list words from the most-significant
; lane (left) to the least-significant lane (right).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    ; Rearrange so that one register holds columns {1,0} of every row and
    ; the other holds columns {2,3} (swapped), letting a single paddw/psubw
    ; produce a1/b1 and d1/c1 for all four rows.
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    ; first pass: pmaddwd against +-1 / (5352,2217) pairs forms the four
    ; outputs per row as 32-bit sums.
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; xmm0: 23 22 21 20 03 02 01 00
    ;
    ; xmm3: 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; transpose for the second pass
    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh                ;swap the two qwords
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff where the word == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;low 4 words (d1): 1 if d1 != 0,
                                                     ;high 4 words (c1): always 0

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Same two-pass transform as above applied to an 8-wide, 4-high strip:
; the low four word lanes and the high four word lanes of every register
; are processed as two side-by-side 4x4 problems.
;
; In:    input  - four rows of eight shorts, `pitch` BYTES apart.  Rows
;                 are read with movdqa, so every row address must be
;                 16-byte aligned.
;        output - 32 shorts written with four movdqa stores (16-byte
;                 aligned).  Bytes 0..31 receive the results computed in
;                 the low qwords, bytes 32..63 those from the high qwords.
; Clobb: xmm0-xmm6; the `input` register is advanced by 2*pitch.
;
; Lane comments in this routine list words from the least-significant
; lane (left) to the most-significant lane (right).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; column held by each register (low qword / high qword):
        ; xmm0 0 / 4
        ; xmm1 1 / 5
        ; xmm2 2 / 6
        ; xmm3 3 / 7

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,       3
        psllw       xmm4,       3

        psllw       xmm0,       3
        psllw       xmm1,       3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1 (d1 lands in the even word of each pair, so
        ; pmaddwd multiplies d1 by the first table entry, c1 by the second)
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; d1 c1 (lanes 0-3)
        punpckhwd   xmm5,       xmm4        ; d1 c1 (lanes 4-7)

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; row held by each register:
        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        movdqa      xmm5,       xmm0        ; copy of row 0
        movdqa      xmm2,       xmm1        ; copy of row 2

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; 0xffff where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and keep bit 0:
                                                                    ; each word = (d1 != 0)

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1 (d1 in the even word of each pair)
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; d1 c1 (lanes 0-3)
        punpckhwd   xmm5,       xmm4        ; d1 c1 (lanes 4-7)

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; regroup: low qwords of (op[0],op[4]) and (op[8],op[12]) go to the
        ; first 32 output bytes, high qwords to the second 32.
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1
        punpckhqdq  xmm4,       xmm1

        punpcklqdq  xmm2,       xmm3
        punpckhqdq  xmm5,       xmm3

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

; Constant tables.  Every table is 16 bytes and 16-byte aligned because it
; is used as a direct XMMWORD memory operand above.
SECTION_RODATA
; pmaddwd multiplier pairs (even word, odd word) = (5352, 2217)
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
; pmaddwd multiplier pairs (even word, odd word) = (2217, -5352)
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; pmaddwd with all ones: sum of each word pair
align 16
_mult_add:
    times 8 dw 1
; 4x4 kernel: bit 0 kept in the low four words, high four words cleared
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
; 8x4 kernel: bit 0 kept in all eight words
align 16
_cmp_mask8x4:
    times 8 dw 1
; pmaddwd with (1, -1): even word minus odd word of each pair
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; rounding/bias terms added before the arithmetic right shifts
align 16
_7:
    times 4 dd 7
align 16
_7w:
    times 8 dw 7
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000