;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; SSE4.1 "x8" SAD kernels (NASM/Yasm, Intel syntax).
;
; Every function in this file computes, for one source block, the sum of
; absolute differences against eight reference blocks that start at
; ref_ptr + 0 .. ref_ptr + 7, and writes the eight 16-bit sums to
; sad_array[0..7].
;
; Argument access and stack set-up go through sym(), arg(),
; SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS, all of which come from
; x86_abi_support.asm.
;
; Register roles inside every function body:
;   rsi   current source row
;   rdi   current reference row (reloaded with sad_array before the store)
;   rax   src_stride, sign-extended from int
;   rdx   ref_stride, sign-extended from int
;   xmm1  running total: eight word lanes, lane j = SAD at ref offset j
;   xmm0, xmm2-xmm5  scratch
;
; rsi and rdi are pushed on entry and popped on exit; rax, rdx and
; xmm0-xmm5 are overwritten.
;
; Memory access notes:
;   * The 16-wide kernels load each source row with movdqa, so every
;     source row address must be 16-byte aligned.
;   * Each reference row is read with two or three 8-byte loads:
;     24 bytes for 16-wide blocks, 16 bytes for 8- and 4-wide blocks.
;   * sad_array is written with an unaligned 16-byte store.
;-----------------------------------------------------------------------

; mpsadbw immediates.  imm8[1:0] picks which 4-byte group of the second
; operand (the source pixels) is used; imm8[2] picks whether the sliding
; 11-byte window in the first operand (the reference pixels) starts at
; byte 0 or byte 4.
%define MPSAD_SRC0_REF0  0x0        ; source bytes 0-3, window from ref byte 0
%define MPSAD_SRC1_REF4  0x5        ; source bytes 4-7, window from ref byte 4

;-----------------------------------------------------------------------
; SAD_ROW_16X8 dst, src_addr, ref_addr
;   dst = eight word SADs of the 16 source bytes at [src_addr] against
;         the reference windows starting at [ref_addr + 0..7].
;   Overwrites xmm0, xmm2, xmm3, xmm4 and dst.
;-----------------------------------------------------------------------
%macro SAD_ROW_16X8 3
        movdqa          xmm0,       XMMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        movq            xmm2,       MMWORD PTR [%3+16]
        punpcklqdq      %1,         xmm3            ; dst  = ref bytes  0..15
        punpcklqdq      xmm3,       xmm2            ; xmm3 = ref bytes  8..23

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,   MPSAD_SRC0_REF0     ; source bytes  0..3
        mpsadbw         xmm2,       xmm0,   MPSAD_SRC1_REF4     ; source bytes  4..7

        psrldq          xmm0,       8               ; bring source bytes 8..15 down

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,   MPSAD_SRC0_REF0     ; source bytes  8..11
        mpsadbw         xmm4,       xmm0,   MPSAD_SRC1_REF4     ; source bytes 12..15

        paddw           %1,         xmm2
        paddw           %1,         xmm3
        paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------
; SAD_ROW_8X8 dst, src_addr, ref_addr
;   Same as SAD_ROW_16X8 for an 8-byte source row.
;   Overwrites xmm0, xmm2, xmm3 and dst.
;-----------------------------------------------------------------------
%macro SAD_ROW_8X8 3
        movq            xmm0,       MMWORD PTR [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3            ; dst = ref bytes 0..15

        movdqa          xmm2,       %1
        mpsadbw         %1,         xmm0,   MPSAD_SRC0_REF0     ; source bytes 0..3
        mpsadbw         xmm2,       xmm0,   MPSAD_SRC1_REF4     ; source bytes 4..7
        paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------
; SAD_ROW_4X8 dst, src_addr, ref_addr
;   Same as SAD_ROW_16X8 for a 4-byte source row.
;   Overwrites xmm0, xmm3 and dst.
;-----------------------------------------------------------------------
%macro SAD_ROW_4X8 3
        movd            xmm0,       [%2]
        movq            %1,         MMWORD PTR [%3]
        movq            xmm3,       MMWORD PTR [%3+8]
        punpcklqdq      %1,         xmm3            ; dst = ref bytes 0..15

        mpsadbw         %1,         xmm0,   MPSAD_SRC0_REF0
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X8 / PROCESS_8X2X8 / PROCESS_4X2X8  first
;   Handle two consecutive rows and advance rsi/rdi by two strides.
;   first != 0: xmm1 is initialised from the first row.
;   first == 0: both rows are added to the existing total in xmm1.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        SAD_ROW_16X8    xmm1, rsi, rdi
%else
        SAD_ROW_16X8    xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_16X8    xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro

%macro PROCESS_8X2X8 1
%if %1
        SAD_ROW_8X8     xmm1, rsi, rdi
%else
        SAD_ROW_8X8     xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_8X8     xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro

%macro PROCESS_4X2X8 1
%if %1
        SAD_ROW_4X8     xmm1, rsi, rdi
%else
        SAD_ROW_4X8     xmm5, rsi, rdi
        paddw           xmm1,       xmm5
%endif
        SAD_ROW_4X8     xmm5, rsi + rax, rdi + rdx
        paddw           xmm1,       xmm5

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]
%endmacro

;-----------------------------------------------------------------------
; SAD_X8_ENTER
;   Common prologue: build the frame, save rsi/rdi and load the four
;   input arguments into rsi, rax, rdi and rdx.
;-----------------------------------------------------------------------
%macro SAD_X8_ENTER 0
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        push            rsi
        push            rdi
        ; end prolog

        mov             rsi,        arg(0)              ;src_ptr
        mov             rdi,        arg(2)              ;ref_ptr

        movsxd          rax,        dword ptr arg(1)    ;src_stride
        movsxd          rdx,        dword ptr arg(3)    ;ref_stride
%endmacro

;-----------------------------------------------------------------------
; SAD_X8_STORE_AND_LEAVE
;   Common tail: write the eight totals in xmm1 to sad_array, undo the
;   prologue and return.  The store is movdqu because the C prototype
;   only promises an unsigned short pointer, not 16-byte alignment.
;-----------------------------------------------------------------------
%macro SAD_X8_STORE_AND_LEAVE 0
        mov             rdi,        arg(4)              ;Results
        movdqu          XMMWORD PTR [rdi],  xmm1

        ; begin epilog
        pop             rdi
        pop             rsi
        UNSHADOW_ARGS
        pop             rbp
        ret
%endmacro


;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
global sym(vp8_sad16x16x8_sse4)
sym(vp8_sad16x16x8_sse4):
        SAD_X8_ENTER

        ; 16 rows, two per step
        PROCESS_16X2X8  1
%rep 7
        PROCESS_16X2X8  0
%endrep

        SAD_X8_STORE_AND_LEAVE


;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad16x8x8_sse4)
sym(vp8_sad16x8x8_sse4):
        SAD_X8_ENTER

        ; 8 rows, two per step
        PROCESS_16X2X8  1
%rep 3
        PROCESS_16X2X8  0
%endrep

        SAD_X8_STORE_AND_LEAVE


;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad8x8x8_sse4)
sym(vp8_sad8x8x8_sse4):
        SAD_X8_ENTER

        ; 8 rows, two per step
        PROCESS_8X2X8   1
%rep 3
        PROCESS_8X2X8   0
%endrep

        SAD_X8_STORE_AND_LEAVE


;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad8x16x8_sse4)
sym(vp8_sad8x16x8_sse4):
        SAD_X8_ENTER

        ; 16 rows, two per step
        PROCESS_8X2X8   1
%rep 7
        PROCESS_8X2X8   0
%endrep

        SAD_X8_STORE_AND_LEAVE


;void vp8_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad4x4x8_sse4)
sym(vp8_sad4x4x8_sse4):
        SAD_X8_ENTER

        ; 4 rows, two per step
        PROCESS_4X2X8   1
        PROCESS_4X2X8   0

        SAD_X8_STORE_AND_LEAVE