;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Notes that apply to every routine in this file:
;   * sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined
;     here; they are expected to come from the include above.
;   * Each routine returns a sum of absolute byte differences (SAD) between
;     a source block and a reference block, walking both with their own
;     row strides.
;   * Loop/exit labels are function-local (leading '.') so they do not
;     occupy the file-global label namespace.

;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; SAD of a 16-byte-wide block over 16 rows, two rows per loop iteration.
;
; Register roles:
;   rsi = current source row          rax = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16*src_stride     (loop end sentinel)
;   xmm5 = running sums (one 16-bit partial sum in each 64-bit half)
;
; Only xmm0-xmm5 are used: xmm6 and above are callee-saved in the
; Microsoft x64 calling convention and this routine does not save them.
; Clobbers: rax, rcx, rdx, xmm0-xmm5, flags.
global sym(vp8_sad16x16_wmt)
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rax*8]
        lea             rcx,        [rcx+rax*8]     ; rcx = src_ptr + 16 rows

        pxor            xmm5,       xmm5            ; clear accumulator

.loop:
        ; first row of the pair: low/high 8 bytes of source and reference
        movq            xmm0,       QWORD PTR [rsi]
        movq            xmm2,       QWORD PTR [rsi+8]

        movq            xmm1,       QWORD PTR [rdi]
        movq            xmm3,       QWORD PTR [rdi+8]

        movq            xmm4,       QWORD PTR [rsi+rax]     ; second row, source low half

        ; Interleave the two 8-byte halves into one 16-byte register.
        ; Source and reference are interleaved identically, so the byte
        ; pairing seen by psadbw is unchanged.
        punpcklbw       xmm0,       xmm2
        punpcklbw       xmm1,       xmm3

        movq            xmm2,       QWORD PTR [rsi+rax+8]   ; second row, source high half
        psadbw          xmm0,       xmm1                    ; first-row SAD

        movq            xmm1,       QWORD PTR [rdi+rdx]     ; second row, reference low half
        movq            xmm3,       QWORD PTR [rdi+rdx+8]   ; second row, reference high half

        lea             rsi,        [rsi+rax*2]             ; advance two rows
        lea             rdi,        [rdi+rdx*2]

        punpcklbw       xmm4,       xmm2
        punpcklbw       xmm1,       xmm3

        psadbw          xmm4,       xmm1                    ; second-row SAD

        paddw           xmm5,       xmm0
        paddw           xmm5,       xmm4

        cmp             rsi,        rcx
        jne             .loop

        ; fold the two 64-bit halves of the accumulator into one total
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movq            rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; SAD of an 8-byte-wide block over 16 rows, two rows per loop iteration.
; Before each iteration the running sum is compared (signed) with max_err;
; if it is already greater, the routine returns that partial sum.
;
; Register roles:
;   rsi = current source row          rbx = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16*src_stride     (loop end sentinel)
;   mm7 = running sum                 rax = copy of running sum / result
;
; NOTE(review): MMX registers are used and no emms is executed here;
; presumably the caller resets the x87/MMX state -- confirm.
; Clobbers: rax, rcx, rdx, mm0-mm3, mm7, flags.
global sym(vp8_sad8x16_wmt)
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]
        lea             rcx,        [rcx+rbx*8]     ; rcx = src_ptr + 16 rows

        pxor            mm7,        mm7             ; clear accumulator

.loop:
        movq            rax,        mm7
        ; max_err is a 32-bit int: compare only eax. The upper half of the
        ; 64-bit argument slot is not guaranteed to hold a valid extension.
        cmp             eax,        dword ptr arg(4) ;max_err
        jg              .early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        movq            mm2,        QWORD PTR [rsi+rbx]
        movq            mm3,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        lea             rsi,        [rsi+rbx*2]     ; advance two rows
        lea             rdi,        [rdi+rdx*2]

        paddw           mm7,        mm0
        paddw           mm7,        mm2

        cmp             rsi,        rcx
        jne             .loop

        movq            rax,        mm7

.early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; SAD of an 8-byte-wide block over 8 rows, one row per loop iteration.
; Before each row the running sum is compared (signed) with max_err;
; if it is already greater, the routine returns that partial sum.
;
; Register roles:
;   rsi = current source row          rbx = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8*src_stride      (loop end sentinel)
;   mm7 = running sum                 rax = copy of running sum / result
;
; NOTE(review): MMX registers are used and no emms is executed here;
; presumably the caller resets the x87/MMX state -- confirm.
; Clobbers: rax, rcx, rdx, mm0, mm1, mm7, flags.
global sym(vp8_sad8x8_wmt)
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]     ; rcx = src_ptr + 8 rows
        pxor            mm7,        mm7             ; clear accumulator

.loop:
        movq            rax,        mm7
        ; max_err is a 32-bit int: compare only eax. The upper half of the
        ; 64-bit argument slot is not guaranteed to hold a valid extension.
        cmp             eax,        dword ptr arg(4) ;max_err
        jg              .early_exit

        movq            mm0,        QWORD PTR [rsi]
        movq            mm1,        QWORD PTR [rdi]

        psadbw          mm0,        mm1
        lea             rsi,        [rsi+rbx]       ; advance one row

        add             rdi,        rdx
        paddw           mm7,        mm0

        cmp             rsi,        rcx
        jne             .loop

        movq            rax,        mm7

.early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; SAD of a 4-byte-wide block over 4 rows, done straight-line: each pair
; of 4-byte rows is interleaved into one 8-byte register (identically for
; source and reference) so a single psadbw covers two rows.
;
; Register roles:
;   rsi = source row pointer          rax = src_stride (sign-extended)
;   rdi = reference row pointer       rdx = ref_stride (sign-extended)
;
; NOTE(review): MMX registers are used and no emms is executed here;
; presumably the caller resets the x87/MMX state -- confirm.
; Clobbers: rax, rdx, mm0-mm7, flags are not modified by the body.
global sym(vp8_sad4x4_wmt)
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        ; rows 0 and 1
        movd            mm0,        DWORD PTR [rsi]
        movd            mm1,        DWORD PTR [rdi]

        movd            mm2,        DWORD PTR [rsi+rax]
        movd            mm3,        DWORD PTR [rdi+rdx]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        psadbw          mm0,        mm1
        lea             rsi,        [rsi+rax*2]     ; advance two rows

        lea             rdi,        [rdi+rdx*2]

        ; rows 2 and 3
        movd            mm4,        DWORD PTR [rsi]
        movd            mm5,        DWORD PTR [rdi]

        movd            mm6,        DWORD PTR [rsi+rax]
        movd            mm7,        DWORD PTR [rdi+rdx]

        punpcklbw       mm4,        mm6
        punpcklbw       mm5,        mm7

        psadbw          mm4,        mm5

        paddw           mm0,        mm4             ; total of all four rows
        movq            rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_err)
;
; SAD of a 16-byte-wide block over 8 rows, two rows per loop iteration,
; each row handled as two 8-byte halves. Before each iteration the running
; sum is compared (signed) with max_err; if it is already greater, the
; routine returns that partial sum.
;
; Register roles:
;   rsi = current source row          rbx = src_stride (sign-extended)
;   rdi = current reference row       rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8*src_stride      (loop end sentinel)
;   mm7 = running sum                 rax = copy of running sum / result
;
; NOTE(review): MMX registers are used and no emms is executed here;
; presumably the caller resets the x87/MMX state -- confirm.
; Clobbers: rax, rcx, rdx, mm0-mm5, mm7, flags.
global sym(vp8_sad16x8_wmt)
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        movsxd          rbx,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        lea             rcx,        [rsi+rbx*8]     ; rcx = src_ptr + 8 rows
        pxor            mm7,        mm7             ; clear accumulator

.loop:
        movq            rax,        mm7
        ; max_err is a 32-bit int: compare only eax. The upper half of the
        ; 64-bit argument slot is not guaranteed to hold a valid extension.
        cmp             eax,        dword ptr arg(4) ;max_err
        jg              .early_exit

        ; first row: low and high halves
        movq            mm0,        QWORD PTR [rsi]
        movq            mm2,        QWORD PTR [rsi+8]

        movq            mm1,        QWORD PTR [rdi]
        movq            mm3,        QWORD PTR [rdi+8]

        ; second row: low halves
        movq            mm4,        QWORD PTR [rsi+rbx]
        movq            mm5,        QWORD PTR [rdi+rdx]

        psadbw          mm0,        mm1
        psadbw          mm2,        mm3

        ; second row: high halves (mm1/mm3 are free again)
        movq            mm1,        QWORD PTR [rsi+rbx+8]
        movq            mm3,        QWORD PTR [rdi+rdx+8]

        psadbw          mm4,        mm5
        psadbw          mm1,        mm3

        lea             rsi,        [rsi+rbx*2]     ; advance two rows
        lea             rdi,        [rdi+rdx*2]

        paddw           mm0,        mm2
        paddw           mm4,        mm1

        paddw           mm7,        mm0
        paddw           mm7,        mm4

        cmp             rsi,        rcx
        jne             .loop

        movq            rax,        mm7

.early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret