;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical post-processing filter, handled four columns per pass.
; For every pixel a 15-row window (8 rows above .. 6 rows below, then
; slid down one row per iteration) keeps a running sum and a running
; sum of squares.  Where  15*sumsq - sum*sum < flimit  the pixel is
; replaced by (pixel + sum + vp8_rv[...]) >> 4, otherwise it is kept.
;
; Results are staged in a 16-entry ring buffer on the stack and written
; back 8 rows late, so the window always reads unfiltered pixels.
;
; Side effects visible in this code:
;   * 8 rows above the first row and 8 rows below the last row of dst
;     are overwritten (8 bytes wide) with copies of the first/last row.
;   * arg(0), arg(2) and arg(3) are modified in their stack shadow slots.
;   * MMX registers mm0-mm7 are used and no emms is issued here
;     (NOTE(review): callers presumably clear the MMX state - confirm).
;
; Register roles inside the loops:
;   rax = pitch            rdx = row counter
;   rsi = row leaving the window (current row - 8)
;   rdi = row entering the window (current row + 7)
;   mm0 = 0, mm5 = sum (4 words), mm6/mm7 = sum of squares (2+2 dwords)
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
    mov         rsi,        arg(0)                      ; s
    pxor        mm0,        mm0                         ; mm0 = 0 (unpack helper)

    movsxd      rax,        dword ptr arg(1)            ; pitch

    ; this copies the last row down into the border 8 rows
    mov         rdi,        rsi
    ; rows is an int: only its low dword is defined (and only the low
    ; dword was bumped by 8 above), so load it sign-extended exactly
    ; like pitch instead of reading the full-width stack slot.
    movsxd      rdx,        dword ptr arg(2)
    sub         rdx,        9                           ; (rows + 8) - 9 = last row index
    imul        rdx,        rax
    lea         rdi,        [rdi+rdx]
    movq        mm1,        QWORD ptr[rdi]              ; last row
    mov         rcx,        8
.init_borderd:                                          ; initialize borders
    lea         rdi,        [rdi + rax]
    movq        [rdi],      mm1

    dec         rcx
    jne         .init_borderd

    neg         rax                                     ; rax = -pitch

    ; this copies the first row up into the border 8 rows
    mov         rdi,        rsi
    movq        mm1,        QWORD ptr[rdi]              ; first row
    mov         rcx,        8
.init_border:                                           ; initialize borders
    lea         rdi,        [rdi + rax]
    movq        [rdi],      mm1

    dec         rcx
    jne         .init_border


    lea         rsi,        [rsi + rax*8]               ; rsi = s[-pitch*8]
    neg         rax                                     ; rax = pitch again


    pxor        mm5,        mm5                         ; sum
    pxor        mm6,        mm6                         ; sum of squares, columns 0-1

    pxor        mm7,        mm7                         ; sum of squares, columns 2-3
    mov         rdi,        rsi

    mov         rcx,        15                          ; prime the window with 15 rows

.loop_initvar:
    movd        mm1,        DWORD PTR [rdi]
    punpcklbw   mm1,        mm0                         ; 4 bytes -> 4 words

    paddw       mm5,        mm1                         ; sum += p
    pmullw      mm1,        mm1                         ; p*p

    movq        mm2,        mm1
    punpcklwd   mm1,        mm0                         ; widen squares to dwords

    punpckhwd   mm2,        mm0
    paddd       mm6,        mm1                         ; sumsq += p*p

    paddd       mm7,        mm2
    lea         rdi,        [rdi+rax]

    dec         rcx
    jne         .loop_initvar
    ;save the var and sum
    xor         rdx,        rdx                         ; row = 0
.loop_row:
    movd        mm1,        DWORD PTR [rsi]             ; [s-pitch*8]
    movd        mm2,        DWORD PTR [rdi]             ; [s+pitch*7]

    punpcklbw   mm1,        mm0
    punpcklbw   mm2,        mm0

    ; slide the window: add the entering row, drop the leaving row
    paddw       mm5,        mm2
    psubw       mm5,        mm1

    pmullw      mm2,        mm2
    movq        mm4,        mm2

    punpcklwd   mm2,        mm0
    punpckhwd   mm4,        mm0

    paddd       mm6,        mm2
    paddd       mm7,        mm4

    pmullw      mm1,        mm1
    movq        mm2,        mm1

    punpcklwd   mm1,        mm0
    psubd       mm6,        mm1

    punpckhwd   mm2,        mm0
    psubd       mm7,        mm2


    ; mm3/mm4 = 15*sumsq - sum*sum - flimit
    movq        mm3,        mm6
    pslld       mm3,        4

    psubd       mm3,        mm6                         ; 16*sumsq - sumsq
    movq        mm1,        mm5

    movq        mm4,        mm5
    pmullw      mm1,        mm1                         ; low  16 bits of sum*sum

    pmulhw      mm4,        mm4                         ; high 16 bits of sum*sum
    movq        mm2,        mm1

    punpcklwd   mm1,        mm4                         ; 32-bit sum*sum, columns 0-1
    punpckhwd   mm2,        mm4                         ; 32-bit sum*sum, columns 2-3

    movq        mm4,        mm7
    pslld       mm4,        4

    psubd       mm4,        mm7

    psubd       mm3,        mm1
    psubd       mm4,        mm2

    psubd       mm3,        flimit2
    psubd       mm4,        flimit2

    ; sign bit -> per-column mask: all ones where the result is negative
    psrad       mm3,        31
    psrad       mm4,        31

    packssdw    mm3,        mm4
    packsswb    mm3,        mm0

    movd        mm1,        DWORD PTR [rsi+rax*8]       ; current row (rsi is 8 rows above)

    movq        mm2,        mm1                         ; keep the unfiltered pixels
    punpcklbw   mm1,        mm0

    paddw       mm1,        mm5                         ; pixel + sum
    mov         rcx,        rdx

    and         rcx,        127                         ; index into vp8_rv = row & 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax,        [GLOBAL(sym(vp8_rv))]
    movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
    movq        mm4,        [sym(vp8_rv) + rcx*2]
%endif
    paddw       mm1,        mm4
    psraw       mm1,        4                           ; (pixel + sum + rv) >> 4

    packuswb    mm1,        mm0
    pand        mm1,        mm3                         ; filtered value where mask is set

    pandn       mm3,        mm2                         ; original value elsewhere
    por         mm1,        mm3

    and         rcx,        15
    movd        DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4]    ; stage the result in the ring buffer

    cmp         edx,        8
    jl          .skip_assignment                        ; nothing to write back for the first 8 rows

    ; write back the result produced 8 iterations ago
    mov         rcx,        rdx
    sub         rcx,        8
    and         rcx,        15
    movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
    movd        [rsi],      mm1

.skip_assignment:
    lea         rsi,        [rsi+rax]

    lea         rdi,        [rdi+rax]
    add         rdx,        1

    cmp         edx,        dword arg(2) ;rows
    jl          .loop_row


    ; s += 4.  dst is a pointer, so the add must be done at full pointer
    ; width: a dword add would lose the carry out of bit 31 on 64-bit
    ; targets.  rcx is dead here (it is reloaded at the top of .loop_col).
    mov         rcx,        4
    add         arg(0),     rcx
    sub         dword arg(3), 4 ; cols -= 4
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 136
    pop         rsp                                     ; undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;-----------------------------------------------------------------------
;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: calls LIBVPX_RAND, uses the low 8 bits of the
; result as a byte offset into noise, then walks the row 8 bytes at a
; time, clamping the source with the three clamp vectors, adding the
; noise bytes and storing the result in place.  Start is advanced by
; Pitch after every row.
;
; Only blackclamp (arg 2) is loaded; whiteclamp and bothclamp are
; addressed at +16 and +32 from it (see the comment in the loop).
; MMX registers mm1/mm2 are used and no emms is issued here.
;-----------------------------------------------------------------------
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(LIBVPX_RAND) WRT_PLT
    mov         rcx, arg(1) ;noise
    and         rax, 0xff                               ; random offset 0..255
    add         rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov         rdx, arg(2) ; blackclamp


    mov         rdi, rcx                                ; rdi = noise + offset
    movsxd      rcx, dword arg(5) ;[Width]
    mov         rsi, arg(0) ;Pos
    xor         rax, rax                                ; byte offset within the row

.addnoise_nextset:
    movq        mm1, [rsi+rax]                          ; get the source

    psubusb     mm1, [rdx] ;blackclamp                  ; clamp both sides so we don't outrange adding noise
    paddusb     mm1, [rdx+32] ;bothclamp
    psubusb     mm1, [rdx+16] ;whiteclamp

    movq        mm2, [rdi+rax]                          ; get the noise for this line
    paddb       mm1, mm2                                ; add it in
    movq        [rsi+rax], mm1                          ; store the result

    add         rax, 8                                  ; advance to the next 8 bytes of the row

    cmp         rax, rcx
    jl          .addnoise_nextset

    movsxd      rax, dword arg(7) ; Pitch
    add         arg(0), rax ; Start += Pitch
    sub         dword arg(6), 1 ; Height -= 1
    jg          .addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

rd:
    times 4 dw 0x40