;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; /****************************************************************************
; * Notes:
; *
; * This implementation makes use of 16 bit fixed point versions of two
; * multiply constants:
; *        1.   sqrt(2) * cos (pi/8)
; *        2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16 bit
; * fixed point precision as the second one, we use a trick of
; *        x * a = x + x*(a-1)
; * so
; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
; *
; * For the second constant, because the 16 bit version is 35468, which
; * is bigger than 32768, in a signed 16 bit multiply it becomes a negative
; * number.
; *        (x * (unsigned)35468 >> 16) = (x * (signed)35468 >> 16) + x
; *
; **************************************************************************/


;-----------------------------------------------------------------------------
; Assembly-time helpers. Each one expands to a fixed straight-line instruction
; sequence; none of them emits labels or touches general purpose registers.
;-----------------------------------------------------------------------------

; IDCT4X4_TERMS
;   In:   mm0, mm1, mm2, mm3 = the four input vectors i0, i1, i2, i3
;         (four 16-bit lanes each).
;   Out:  mm0 = i0 - i2
;         mm2 = i0 + i2
;         mm7 = C(i3) - S(i1)
;         mm3 = S(i3) + C(i1)
;         where S(x) = pmulhw(x, x_s1sqr2)      + x
;               C(x) = pmulhw(x, x_c1sqr2less1) + x
;   Clobbers mm4 and mm5. mm1 is read only, mm6 is not touched.
%macro IDCT4X4_TERMS 0
    psubw       mm0, mm2                        ; i0 - i2
    paddw       mm2, mm2                        ; 2 * i2

    movq        mm5, mm1
    paddw       mm2, mm0                        ; (i0 - i2) + 2 * i2 = i0 + i2

    pmulhw      mm5, [GLOBAL(x_s1sqr2)]
    paddw       mm5, mm1                        ; S(i1)

    movq        mm7, mm3
    pmulhw      mm7, [GLOBAL(x_c1sqr2less1)]

    paddw       mm7, mm3                        ; C(i3)
    psubw       mm7, mm5                        ; C(i3) - S(i1)

    movq        mm5, mm1
    movq        mm4, mm3                        ; keep i3 for the "+ x" step

    pmulhw      mm5, [GLOBAL(x_c1sqr2less1)]
    paddw       mm5, mm1                        ; C(i1)

    pmulhw      mm3, [GLOBAL(x_s1sqr2)]
    paddw       mm3, mm4                        ; S(i3)

    paddw       mm3, mm5                        ; S(i3) + C(i1)
%endmacro

; IDCT4X4_BUTTERFLY
;   In:   mm2 = even sum, mm0 = even difference, mm3 / mm7 as left by
;         IDCT4X4_TERMS.
;   Out:  mm2 = in_mm2 + mm3
;         mm0 = in_mm0 - mm7
;         mm4 = in_mm0 + mm7
;         mm6 = in_mm2 - mm3
;   mm3 and mm7 are left unchanged.
%macro IDCT4X4_BUTTERFLY 0
    movq        mm6, mm2
    movq        mm4, mm0

    paddw       mm2, mm3
    paddw       mm4, mm7

    psubw       mm0, mm7
    psubw       mm6, mm3
%endmacro

; IDCT4X4_TRANSPOSE
;   4x4 transpose of 16-bit words.
;   In:   rows held in mm2, mm0, mm4, mm6 (in that order).
;   Out:  transposed rows in mm0, mm1, mm2, mm5 (in that order).
;   Clobbers mm3 and mm4.
%macro IDCT4X4_TRANSPOSE 0
    movq        mm1, mm2
    movq        mm3, mm4

    punpcklwd   mm1, mm0                        ; interleave low  words of rows 0/1
    punpckhwd   mm2, mm0                        ; interleave high words of rows 0/1

    punpcklwd   mm3, mm6                        ; interleave low  words of rows 2/3
    punpckhwd   mm4, mm6                        ; interleave high words of rows 2/3

    movq        mm0, mm1
    movq        mm5, mm2

    punpckldq   mm0, mm3                        ; column 0
    punpckhdq   mm1, mm3                        ; column 1

    punpckldq   mm2, mm4                        ; column 2
    punpckhdq   mm5, mm4                        ; column 3
%endmacro

; DC_ADD_ROW  mmreg, src, dst
;   Loads four bytes from src, widens them to words against the zero register
;   mm0, adds the word vector in mm5 with signed saturation, packs back to
;   bytes with unsigned saturation and stores four bytes to dst.
%macro DC_ADD_ROW 3
    movd        %1, %2
    punpcklbw   %1, mm0
    paddsw      %1, mm5
    packuswb    %1, mm0
    movd        %3, %1
%endmacro


;-----------------------------------------------------------------------------
;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
;
; Reads sixteen 16-bit values from input (four 8-byte loads), runs two
; transform passes separated by a transpose, adds 4 and arithmetic-shifts
; right by 3 in the second pass, transposes again and stores four 8-byte rows.
; pitch is sign-extended and used directly as the byte distance between
; successive output rows. No emms is issued here.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx)
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)                     ; input
    mov         rdx, arg(1)                     ; output

    movq        mm0, [rax]
    movq        mm1, [rax + 8]

    movq        mm2, [rax + 16]
    movq        mm3, [rax + 24]

    movsxd      rax, dword ptr arg(2)           ; pitch (input pointer no longer needed)

    ; ---- first pass ----
    IDCT4X4_TERMS
    IDCT4X4_BUTTERFLY
    IDCT4X4_TRANSPOSE

    movq        mm3, mm5                        ; second pass expects its 4th vector in mm3

    ; ---- second pass, with rounding ----
    IDCT4X4_TERMS

    paddw       mm0, [GLOBAL(fours)]            ; bias both even terms so that every
    paddw       mm2, [GLOBAL(fours)]            ; butterfly output carries the +4

    IDCT4X4_BUTTERFLY

    psraw       mm2, 3
    psraw       mm0, 3

    psraw       mm4, 3
    psraw       mm6, 3

    IDCT4X4_TRANSPOSE

    ; ---- store rows 0..3 ----
    movq        [rdx], mm0

    movq        [rdx + rax], mm1
    movq        [rdx + rax*2], mm2

    add         rdx, rax                        ; rdx + 2*rax now addresses row 3
    movq        [rdx + rax*2], mm5

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
;
; Takes the first 16-bit value at input, computes (value + 4) >> 3
; (arithmetic shift), replicates it into all four word lanes and stores that
; 8-byte vector to four rows of output spaced pitch bytes apart.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_1_mmx)
sym(vp8_short_idct4x4llm_1_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    ; end prolog

    mov         rax, arg(0)                     ; input
    mov         rdx, arg(1)                     ; output

    movd        mm0, [rax]                      ; 32-bit load; only the low word survives
    movsxd      rax, dword ptr arg(2)           ; pitch

    paddw       mm0, [GLOBAL(fours)]
    psraw       mm0, 3

    punpcklwd   mm0, mm0                        ; low word -> low two lanes
    punpckldq   mm0, mm0                        ; low dword -> all four lanes

    movq        [rdx], mm0
    movq        [rdx + rax], mm0

    movq        [rdx + rax*2], mm0
    add         rdx, rax                        ; rdx + 2*rax now addresses row 3

    movq        [rdx + rax*2], mm0

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
;
; Computes (input_dc + 4) >> 3, broadcasts it to four word lanes and adds it
; to four rows of four bytes read from pred_ptr (rows pitch bytes apart).
; Each result is saturated to an unsigned byte and written to dst_ptr
; (rows stride bytes apart).
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(1)                     ; s -- prediction
    mov         rdi, arg(2)                     ; d -- destination
    movsxd      rax, dword ptr arg(4)           ; stride (destination)
    movsxd      rdx, dword ptr arg(3)           ; pitch  (prediction)
    pxor        mm0, mm0                        ; zero, used for unpack and pack

    movd        mm5, arg(0)                     ; input_dc

    paddw       mm5, [GLOBAL(fours)]

    psraw       mm5, 3

    punpcklwd   mm5, mm5                        ; replicate the low word
    punpckldq   mm5, mm5                        ; ... into all four lanes

    DC_ADD_ROW  mm1, [rsi],         [rdi]       ; row 0
    DC_ADD_ROW  mm2, [rsi + rdx],   [rdi + rax] ; row 1
    DC_ADD_ROW  mm3, [rsi + 2*rdx], [rdi + 2*rax] ; row 2

    add         rdi, rax                        ; step both pointers one row so that
    add         rsi, rdx                        ; base + 2*step addresses row 3
    DC_ADD_ROW  mm4, [rsi + 2*rdx], [rdi + 2*rax] ; row 3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; Constants, each replicated across four 16-bit lanes.
;-----------------------------------------------------------------------------
SECTION_RODATA
align 16
x_s1sqr2:                                       ; 35468 (0x8A8C), see notes at top of file
    times 4 dw 0x8A8C
align 16
x_c1sqr2less1:                                  ; 20091 (0x4E7B)
    times 4 dw 0x4E7B
align 16
fours:                                          ; rounding bias applied before the >> 3
    times 4 dw 0x0004