;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Helper macros local to this file.
;
; Register roles shared by every routine below:
;   rsi = source pointer       rdi = destination pointer
;   rax = first stride         rcx = second stride (copy routines only)
; rsi and rdi are pushed in the prologue and popped in the epilogue, so
; callers see them unchanged.  None of the routines executes emms.
; NOTE(review): callers presumably clear the MMX state themselves before
; any x87 use - confirm against the call sites.
;-----------------------------------------------------------------------

; Common entry sequence for the four-argument routines in this file.
%macro RECON_MMX_PROLOG 0
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
%endmacro

; Undoes RECON_MMX_PROLOG; the caller of the macro still issues `ret`.
%macro RECON_MMX_EPILOG 0
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
%endmacro

; Reconstructs one row of four pixels.
;   %1 = scratch MMX register
;   %2 = memory operand holding 4 source bytes
;   %3 = memory operand holding 4 signed 16-bit addends
;   %4 = memory operand receiving the 4 result bytes
; Expects mm0 == 0.
%macro RECON_B_ROW4 4
    movd        %1,         %2
    punpcklbw   %1,         mm0         ; widen the bytes to words
    paddsw      %1,         %3          ; signed saturating add
    packuswb    %1,         mm0         ; clamp each word to 0..255
    movd        %4,         %1
%endmacro

; Copies three 16-byte rows from rsi (stride rax) to rdi (stride rcx).
; All six quadwords are loaded before any is stored, then both pointers
; are advanced by three rows.
%macro COPY16_THREE_ROWS 0
    movq        mm0,        [rsi]
    movq        mm3,        [rsi+8];

    movq        mm1,        [rsi+rax]
    movq        mm4,        [rsi+rax+8]

    movq        mm2,        [rsi+rax*2]
    movq        mm5,        [rsi+rax*2+8]

    lea         rsi,        [rsi+rax*2]
    add         rsi,        rax

    movq        [rdi],          mm0
    movq        [rdi+8],        mm3

    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx+8],    mm4

    movq        [rdi+rcx*2],    mm2
    movq        [rdi+rcx*2+8],  mm5

    lea         rdi,        [rdi+rcx*2]
    add         rdi,        rcx
%endmacro


;-----------------------------------------------------------------------
; void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d,
;                      int stride)
;
; For each of four rows: reads 4 bytes from s, adds the matching four
; 16-bit values from q with signed saturation, clamps the sums to
; unsigned bytes and writes 4 bytes to d.
;   s rows are 16 bytes apart, q rows are 32 bytes apart,
;   d rows are `stride` bytes apart.
; Clobbers: rax, rdx, mm0-mm4, flags.
;-----------------------------------------------------------------------
global sym(vp8_recon_b_mmx)
sym(vp8_recon_b_mmx):
    RECON_MMX_PROLOG
    ; end prolog

    mov         rsi,        arg(0)              ; s
    mov         rdx,        arg(1)              ; q
    mov         rdi,        arg(2)              ; d
    movsxd      rax,        dword ptr arg(3)    ; stride
    pxor        mm0,        mm0                 ; zero used for unpack/pack

    RECON_B_ROW4 mm1, [rsi],    [rdx],    [rdi]
    RECON_B_ROW4 mm2, [rsi+16], [rdx+32], [rdi+rax]
    RECON_B_ROW4 mm3, [rsi+32], [rdx+64], [rdi+2*rax]

    add         rdi,        rax                 ; so [rdi+2*rax] is row 3
    RECON_B_ROW4 mm4, [rsi+48], [rdx+96], [rdi+2*rax]

    ; begin epilog
    RECON_MMX_EPILOG
    ret


;-----------------------------------------------------------------------
; void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride,
;                          unsigned char *dst, int dst_stride)
;
; Copies eight rows of 8 bytes.  Rows move in groups of 3, 3 and 2; each
; group is fully loaded before it is stored.
; Clobbers: rax, rcx, mm0-mm5, flags.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem8x8_mmx)
sym(vp8_copy_mem8x8_mmx):
    RECON_MMX_PROLOG
    ; end prolog

    mov         rsi,        arg(0)              ; src
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; dst
    movsxd      rcx,        dword ptr arg(3)    ; dst_stride

    ; rows 0-2
    movq        mm0,        [rsi]
    movq        mm1,        [rsi+rax]
    movq        mm2,        [rsi+rax*2]

    lea         rsi,        [rsi+rax*2]
    add         rsi,        rax                 ; rsi = src + 3*src_stride

    movq        [rdi],          mm0
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm2

    lea         rdi,        [rdi+rcx*2]
    add         rdi,        rcx                 ; rdi = dst + 3*dst_stride

    ; rows 3-5
    movq        mm3,        [rsi]
    movq        mm4,        [rsi+rax]
    movq        mm5,        [rsi+rax*2]

    lea         rsi,        [rsi+rax*2]         ; rsi = src + 5*src_stride

    movq        [rdi],          mm3
    movq        [rdi+rcx],      mm4
    movq        [rdi+rcx*2],    mm5

    lea         rdi,        [rdi+rcx*2]         ; rdi = dst + 5*dst_stride

    ; rows 6-7, addressed one and two strides past row 5
    movq        mm0,        [rsi+rax]
    movq        mm1,        [rsi+rax*2]

    movq        [rdi+rcx],      mm0
    movq        [rdi+rcx*2],    mm1

    ; begin epilog
    RECON_MMX_EPILOG
    ret


;-----------------------------------------------------------------------
; void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride,
;                          unsigned char *dst, int dst_stride)
;
; Copies four rows of 8 bytes: rows 0-2 as one group, then row 3.
; Clobbers: rax, rcx, mm0-mm3.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem8x4_mmx)
sym(vp8_copy_mem8x4_mmx):
    RECON_MMX_PROLOG
    ; end prolog

    mov         rsi,        arg(0)              ; src
    movsxd      rax,        dword ptr arg(1)    ; src_stride
    mov         rdi,        arg(2)              ; dst
    movsxd      rcx,        dword ptr arg(3)    ; dst_stride

    ; rows 0-2
    movq        mm0,        [rsi]
    movq        mm1,        [rsi+rax]
    movq        mm2,        [rsi+rax*2]

    lea         rsi,        [rsi+rax*2]         ; rsi = src + 2*src_stride

    movq        [rdi],          mm0
    movq        [rdi+rcx],      mm1
    movq        [rdi+rcx*2],    mm2

    lea         rdi,        [rdi+rcx*2]         ; rdi = dst + 2*dst_stride

    ; row 3, one stride past row 2
    movq        mm3,        [rsi+rax]
    movq        [rdi+rcx],      mm3

    ; begin epilog
    RECON_MMX_EPILOG
    ret


;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_mmx(unsigned char *src, int src_stride,
;                            unsigned char *dst, int dst_stride)
;
; Copies sixteen rows of 16 bytes: five groups of three rows followed
; by one final row.  Each row is moved as two quadwords.
; Clobbers: rax, rcx, mm0-mm5, flags.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_mmx)
sym(vp8_copy_mem16x16_mmx):
    RECON_MMX_PROLOG
    ; end prolog

    mov         rsi,        arg(0)              ; src
    movsxd      rax,        dword ptr arg(1)    ; src_stride

    mov         rdi,        arg(2)              ; dst
    movsxd      rcx,        dword ptr arg(3)    ; dst_stride

    ; rows 0-14
%rep 5
    COPY16_THREE_ROWS
%endrep

    ; row 15
    movq        mm0,        [rsi]
    movq        mm3,        [rsi+8];

    movq        [rdi],          mm0
    movq        [rdi+8],        mm3

    ; begin epilog
    RECON_MMX_EPILOG
    ret