;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over a block 16 bytes wide and 16 rows high.
; Two rows are handled per loop iteration. Each 16-byte row is loaded as two
; 8-byte halves that are byte-interleaved with punpcklbw; src and ref are
; interleaved the same way, so psadbw still pairs matching bytes.
; xmm6 accumulates two partial sums (one per 64-bit lane) that are folded
; together after the loop. The result is returned in rax.
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16*src_stride, the loop end marker
        lea             rcx,        [rsi+rax*8]

        lea             rcx,        [rcx+rax*8]
        pxor            xmm6,       xmm6            ; clear the accumulator

.x16x16sad_wmt_loop:

        ; row n: low and high halves of src (xmm0/xmm2) and ref (xmm1/xmm3)
        movq            xmm0,       QWORD PTR [rsi]
        movq            xmm2,       QWORD PTR [rsi+8]

        movq            xmm1,       QWORD PTR [rdi]
        movq            xmm3,       QWORD PTR [rdi+8]

        ; row n+1: low halves of src (xmm4) and ref (xmm5)
        movq            xmm4,       QWORD PTR [rsi+rax]
        movq            xmm5,       QWORD PTR [rdi+rdx]


        punpcklbw       xmm0,       xmm2
        punpcklbw       xmm1,       xmm3

        psadbw          xmm0,       xmm1            ; SAD of row n
        movq            xmm2,       QWORD PTR [rsi+rax+8]

        movq            xmm3,       QWORD PTR [rdi+rdx+8]
        lea             rsi,        [rsi+rax*2]     ; advance src by two rows

        lea             rdi,        [rdi+rdx*2]     ; advance ref by two rows
        punpcklbw       xmm4,       xmm2

        punpcklbw       xmm5,       xmm3
        psadbw          xmm4,       xmm5            ; SAD of row n+1

        paddw           xmm6,       xmm0
        paddw           xmm6,       xmm4

        cmp             rsi,        rcx
        jne             .x16x16sad_wmt_loop

        ; fold the high 64-bit lane of the accumulator onto the low lane
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movq            rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute differences over a block 8 bytes wide and 16 rows high,
; two rows per loop iteration, accumulated in mm7.
; Before each iteration the running sum is compared (unsigned) with max_sad;
; once it is above max_sad the function returns that partial sum in rax
; instead of finishing the block.
; NOTE(review): MMX registers are used and no emms is issued here; callers
; presumably reset the x87/MMX state themselves - confirm.
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rcx = src_ptr + 16*src_stride, the loop end marker
        lea             rcx,        [rsi+rbx*8]

        lea             rcx,        [rcx+rbx*8]
        pxor            mm7,        mm7             ; clear the accumulator

.x8x16sad_wmt_loop:

        ; early exit: rax already holds the running sum when the jump is taken
        movq            rax,        mm7
        cmp             eax,        arg(4)
        ja              .x8x16sad_wmt_early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1             ; SAD of row n
        psadbw          mm2,        mm3             ; SAD of row n+1

        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm7,        mm0
        paddw           mm7,        mm2

        cmp             rsi,        rcx
        jne             .x8x16sad_wmt_loop

        movq            rax,        mm7

.x8x16sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute differences over a block 8 bytes wide and 8 rows high,
; one row per loop iteration, accumulated in mm7.
; Before each row the running sum is compared (unsigned) with max_sad (the
; fifth argument, read via arg(4)); once it is above max_sad the function
; returns that partial sum in rax instead of finishing the block.
; NOTE(review): MMX registers are used and no emms is issued here; callers
; presumably reset the x87/MMX state themselves - confirm.
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]     ; rcx = src_ptr + 8*src_stride
        pxor            mm7,        mm7             ; clear the accumulator

.x8x8sad_wmt_loop:

        ; early exit: rax already holds the running sum when the jump is taken
        movq            rax,        mm7
        cmp             eax,        arg(4)
        ja              .x8x8sad_wmt_early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        psadbw          mm0,        mm1
        lea             rsi,        [rsi+rbx]

        add             rdi,        rdx
        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             .x8x8sad_wmt_loop

        movq            rax,        mm7
.x8x8sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over a block 4 bytes wide and 4 rows high.
; Straight-line code: two 4-byte rows are byte-interleaved into one 8-byte
; MMX register (identically for src and ref) so a single psadbw covers two
; rows. The result is returned in rax.
; NOTE(review): MMX registers are used and no emms is issued here; callers
; presumably reset the x87/MMX state themselves - confirm.
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rows 0 and 1
        movd            mm0,        DWORD PTR [rsi]
        movd            mm1,        DWORD PTR [rdi]

        movd            mm2,        DWORD PTR [rsi+rax]
        movd            mm3,        DWORD PTR [rdi+rdx]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        psadbw          mm0,        mm1
        lea             rsi,        [rsi+rax*2]

        ; rows 2 and 3
        lea             rdi,        [rdi+rdx*2]
        movd            mm4,        DWORD PTR [rsi]

        movd            mm5,        DWORD PTR [rdi]
        movd            mm6,        DWORD PTR [rsi+rax]

        movd            mm7,        DWORD PTR [rdi+rdx]
        punpcklbw       mm4,        mm6

        punpcklbw       mm5,        mm7
        psadbw          mm4,        mm5

        paddw           mm0,        mm4
        movq            rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute differences over a block 16 bytes wide and 8 rows high,
; two rows per loop iteration (four 8-byte psadbw operations), accumulated
; in mm7.
; Before each iteration the running sum is compared (unsigned) with max_sad
; (the fifth argument, read via arg(4)); once it is above max_sad the
; function returns that partial sum in rax instead of finishing the block.
; NOTE(review): MMX registers are used and no emms is issued here; callers
; presumably reset the x87/MMX state themselves - confirm.
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog


        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]     ; rcx = src_ptr + 8*src_stride
        pxor            mm7,        mm7             ; clear the accumulator

.x16x8sad_wmt_loop:

        ; early exit: rax already holds the running sum when the jump is taken
        movq            rax,        mm7
        cmp             eax,        arg(4)
        ja              .x16x8sad_wmt_early_exit

        ; row n: left and right halves
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        ; row n+1: left half
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        ; row n+1: right half (mm1/mm3 are free again after the psadbw above)
        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]

        psadbw          mm4,        mm5
        psadbw          mm1,        mm3

        lea             rsi,        [rsi+rbx*2]
        lea             rdi,        [rdi+rdx*2]

        paddw           mm0,        mm2
        paddw           mm4,        mm1

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             .x16x8sad_wmt_loop

        movq            rax,        mm7

.x16x8sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies `height` rows of 32 bytes from src to dst.
; Source rows are read with movdqu (no alignment requirement); destination
; rows are written with movdqa, so every destination row address
; (dst_ptr + k*dst_stride) must be 16-byte aligned.
; Rows are copied four at a time while at least four remain, then one at a
; time. A height below four goes straight to the single-row loop and a
; height of zero or less copies nothing.
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;dst_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;dst_stride
        movsxd          rcx,        dword ptr arg(4) ;height

        ; the unrolled loop always moves four rows, so only enter it when
        ; at least four rows are requested
        cmp             rcx,        4
        jl              .block_copy_sse2_tail

.block_copy_sse2_loopx4:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        movdqu          xmm2,       XMMWORD PTR [rsi + rax]
        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqu          xmm4,       XMMWORD PTR [rsi]
        movdqu          xmm5,       XMMWORD PTR [rsi + 16]
        movdqu          xmm6,       XMMWORD PTR [rsi + rax]
        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

        lea             rsi,        [rsi+rax*2]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        movdqa          XMMWORD PTR [rdi + rdx], xmm2
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

        lea             rdi,        [rdi+rdx*2]

        movdqa          XMMWORD PTR [rdi], xmm4
        movdqa          XMMWORD PTR [rdi + 16], xmm5
        movdqa          XMMWORD PTR [rdi + rdx], xmm6
        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

        lea             rdi,        [rdi+rdx*2]

        sub             rcx,        4
        cmp             rcx,        4
        jge             .block_copy_sse2_loopx4

.block_copy_sse2_tail:
        ; rcx = rows still to copy (0..3 after the unrolled loop); signed
        ; test so that a zero or negative count ends the copy
        cmp             rcx,        0
        jle             .copy_is_done

.block_copy_sse2_loop:
        movdqu          xmm0,       XMMWORD PTR [rsi]
        movdqu          xmm1,       XMMWORD PTR [rsi + 16]
        lea             rsi,        [rsi+rax]

        movdqa          XMMWORD PTR [rdi], xmm0
        movdqa          XMMWORD PTR [rdi + 16], xmm1
        lea             rdi,        [rdi+rdx]

        sub             rcx,        1
        jne             .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret