;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform on 16-bit samples using SSE2.
;
;   input  - arg(0): first of four rows, four 16-bit values are read per row
;            (movq loads, so no alignment requirement on the input).
;   output - arg(1): receives sixteen 16-bit results as two 16-byte stores.
;            The stores use movdqa, so output must be 16-byte aligned.
;   pitch  - arg(2): signed 32-bit distance between rows IN BYTES (it is
;            added unscaled to the input pointer).
;
; Pass 1 works along each input row:
;   a1 = (ip[0] + ip[3]) << 3        b1 = (ip[1] + ip[2]) << 3
;   c1 = (ip[1] - ip[2]) << 3        d1 = (ip[0] - ip[3]) << 3
;   op[0] = a1 + b1                  op[2] = a1 - b1
;   op[1] = (c1*2217 + d1*5352 + 14500) >> 12
;   op[3] = (d1*2217 - c1*5352 +  7500) >> 12
; Pass 2 works down each column of the pass 1 result:
;   a1 = ip[0] + ip[12]              b1 = ip[4] + ip[8]
;   c1 = ip[4] - ip[8]               d1 = ip[0] - ip[12]
;   op[0]  = (a1 + b1 + 7) >> 4      op[8]  = (a1 - b1 + 7) >> 4
;   op[4]  = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
;   op[12] =  (d1*2217 - c1*5352 + 51000) >> 16
;
; Lane diagrams in the comments list the eight 16-bit words of a register
; from most significant (left) to least significant (right).
;
; Only xmm0-xmm5 are used.
; NOTE(review): SAVE_XMM / RESTORE_XMM are commented out below; presumably
; that is because no callee-saved xmm register is touched - confirm against
; the ABIs this file is built for.
; arg(), GLOBAL(), sym(), SHADOW_ARGS_TO_STACK, GET_GOT and friends are macros
; from x86_abi_support.asm (not visible in this file).
global sym(vp8_short_fdct4x4_sse2)
sym(vp8_short_fdct4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
;;    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;input
    movsxd      rax, DWORD PTR arg(2)       ;pitch (bytes), sign-extended
    lea         rdi, [rsi + rax*2]          ;rdi = start of row 2

    ; load the four rows; lane labels are <row><column>
    movq        xmm0, MMWORD PTR[rsi   ]                    ;03 02 01 00
    movq        xmm2, MMWORD PTR[rsi + rax]                 ;13 12 11 10
    movq        xmm1, MMWORD PTR[rsi + rax*2]               ;23 22 21 20
    movq        xmm3, MMWORD PTR[rdi + rax]                 ;33 32 31 30

    punpcklqdq  xmm0, xmm2                  ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                  ;33 32 31 30 23 22 21 20

    mov         rdi, arg(1)                 ;output (rdi no longer needed for input)

    ; regroup so that xmm0 holds columns 0,1 of every row and xmm1 holds
    ; columns 3,2 (swapped) of every row, lining up ip[0] with ip[3] and
    ; ip[1] with ip[2] for the add/sub below
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                  ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                  ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                  ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h            ;22 23 02 03 xx xx xx xx (0b1h swaps adjacent words)
    pshufhw     xmm2, xmm2, 0b1h            ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                  ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                  ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                  ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                     ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                     ;c1 <<= 3 d1 <<= 3
    ; pmaddwd multiplies word pairs by the constant pairs and sums each pair
    ; into one 32-bit result per row
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]        ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]        ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]       ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    ; pack the 32-bit results back to signed 16-bit with saturation
    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; Lane labels in the next four comment lines are <op index><row>:
    ; xmm0: 23 22 21 20 03 02 01 00
    ;
    ; xmm3: 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; From here on lane labels are <row><column> of the pass 1 result.
    ; The word interleaves below finish the transpose into row-major order.
    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ;rounding term for the >>4 below
    pshufd      xmm2, xmm2, 04eh                ;swap 64-bit halves: 23 22 21 20 33 32 31 30
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    ; 0d8h swaps the two middle elements; applied to dwords and then to the
    ; words of each half it interleaves a1/b1 (and d1/c1) per column
    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff in every word that is zero
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
                                                     ;and keep bit 0 of lower
                                                     ;=> low four words = (d1 != 0)

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    ; aligned stores: output must be 16-byte aligned
    movdqa      XMMWORD PTR[rdi + 0], xmm0      ;op[0..7]
    movdqa      XMMWORD PTR[rdi + 16], xmm1     ;op[8..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
;;    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pmaddwd multiplier pairs: the first word of each pair applies to the lower
; (even) word lane, the second to the upper (odd) word lane.
align 16
_5352_2217:                     ;d1*5352 + c1*2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 16
_2217_neg5352:                  ;d1*2217 - c1*5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
align 16
_mult_add:                      ;a1*1 + b1*1
    times 8 dw 1
align 16
_cmp_mask:                      ;1 in the low four words, 0 in the high four
    times 4 dw 1
    times 4 dw 0

align 16
_mult_sub:                      ;a1*1 + b1*(-1)
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; 32-bit rounding/offset terms added before the arithmetic right shifts
align 16
_7:
    times 4 dd 7
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000