1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 #include "memory.h" 13 #include "preproc.h" 14 #include "pragmas.h" 15 16 /**************************************************************************** 17 * Macros 18 ****************************************************************************/ 19 #define FRAMECOUNT 7 20 #define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) 21 22 /**************************************************************************** 23 * Imports 24 ****************************************************************************/ 25 extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); 26 27 /**************************************************************************** 28 * Exported Global Variables 29 ****************************************************************************/ 30 void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); 31 32 /**************************************************************************** 33 * 34 * ROUTINE : temp_filter_wmt 35 * 36 * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. 37 * unsigned char *s : Pointer to source frame. 38 * unsigned char *d : Pointer to destination frame. 39 * int bytes : Number of bytes to filter. 40 * int strength : Strength of filter to apply. 41 * 42 * OUTPUTS : None. 43 * 44 * RETURNS : void 45 * 46 * FUNCTION : Performs a closesness adjusted temporarl blur 47 * 48 * SPECIAL NOTES : Destination frame can be same as source frame. 49 * 50 ****************************************************************************/ 51 void temp_filter_wmt 52 ( 53 pre_proc_instance *ppi, 54 unsigned char *s, 55 unsigned char *d, 56 int bytes, 57 int strength 58 ) 59 { 60 int byte = 0; 61 unsigned char *frameptr = ppi->frame_buffer; 62 63 __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3}; 64 __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; 65 66 if (ppi->frame == 0) 67 { 68 do 69 { 70 int i; 71 int frame = 0; 72 73 do 74 { 75 for (i = 0; i < 8; i++) 76 { 77 *frameptr = s[byte+i]; 78 ++frameptr; 79 } 80 81 ++frame; 82 } 83 while (frame < FRAMECOUNT); 84 85 for (i = 0; i < 8; i++) 86 d[byte+i] = s[byte+i]; 87 88 byte += 8; 89 90 } 91 while (byte < bytes); 92 } 93 else 94 { 95 int i; 96 int offset2 = (ppi->frame % FRAMECOUNT); 97 98 do 99 { 100 __declspec(align(16)) unsigned short counts[8]; 101 __declspec(align(16)) unsigned short sums[8]; 102 __asm 103 { 104 mov eax, offset2 105 mov edi, s // source pixels 106 pxor xmm1, xmm1 // accumulator 107 108 pxor xmm7, xmm7 109 110 mov esi, frameptr // accumulator 111 pxor xmm2, xmm2 // count 112 113 movq xmm3, QWORD PTR [edi] 114 115 movq QWORD PTR [esi+8*eax], xmm3 116 117 punpcklbw xmm3, xmm2 // xmm3 source pixels 118 mov ecx, FRAMECOUNT 119 120 next_frame: 121 movq xmm4, QWORD PTR [esi] // get frame buffer values 122 punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels 123 movdqa xmm6, xmm4 // save the pixel values 124 psubsw xmm4, xmm3 // subtracted pixel values 125 pmullw xmm4, xmm4 // square xmm4 126 movd xmm5, strength 127 psrlw xmm4, xmm5 // should be strength 128 pmullw xmm4, threes // 3 * modifier 129 movdqa xmm5, sixteens // 16s 130 psubusw xmm5, xmm4 // 16 - modifiers 131 movdqa xmm4, xmm5 // save the modifiers 132 pmullw xmm4, xmm6 // multiplier values 133 paddusw xmm1, xmm4 // accumulator 134 paddusw xmm2, xmm5 // count 135 add esi, 8 // next frame 136 dec ecx // next set of eight pixels 137 jnz next_frame 138 139 movdqa counts, xmm2 140 psrlw xmm2, 1 // divide count by 2 for rounding 141 paddusw xmm1, xmm2 // rounding added in 142 143 mov frameptr, esi 144 145 movdqa sums, xmm1 146 } 147 148 for (i = 0; i < 8; i++) 149 { 150 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; 151 blurvalue >>= 16; 152 d[i] = blurvalue; 153 } 154 155 s += 8; 156 d += 8; 157 byte += 8; 158 } 159 while (byte < bytes); 160 } 161 162 ++ppi->frame; 163 __asm emms 164 } 165 166 /**************************************************************************** 167 * 168 * ROUTINE : temp_filter_mmx 169 * 170 * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. 171 * unsigned char *s : Pointer to source frame. 172 * unsigned char *d : Pointer to destination frame. 173 * int bytes : Number of bytes to filter. 174 * int strength : Strength of filter to apply. 175 * 176 * OUTPUTS : None. 177 * 178 * RETURNS : void 179 * 180 * FUNCTION : Performs a closesness adjusted temporarl blur 181 * 182 * SPECIAL NOTES : Destination frame can be same as source frame. 183 * 184 ****************************************************************************/ 185 void temp_filter_mmx 186 ( 187 pre_proc_instance *ppi, 188 unsigned char *s, 189 unsigned char *d, 190 int bytes, 191 int strength 192 ) 193 { 194 int byte = 0; 195 unsigned char *frameptr = ppi->frame_buffer; 196 197 __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3}; 198 __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16}; 199 200 if (ppi->frame == 0) 201 { 202 do 203 { 204 int i; 205 int frame = 0; 206 207 do 208 { 209 for (i = 0; i < 4; i++) 210 { 211 *frameptr = s[byte+i]; 212 ++frameptr; 213 } 214 215 ++frame; 216 } 217 while (frame < FRAMECOUNT); 218 219 for (i = 0; i < 4; i++) 220 d[byte+i] = s[byte+i]; 221 222 byte += 4; 223 224 } 225 while (byte < bytes); 226 } 227 else 228 { 229 int i; 230 int offset2 = (ppi->frame % FRAMECOUNT); 231 232 do 233 { 234 __declspec(align(16)) unsigned short counts[8]; 235 __declspec(align(16)) unsigned short sums[8]; 236 __asm 237 { 238 239 mov eax, offset2 240 mov edi, s // source pixels 241 pxor mm1, mm1 // accumulator 242 pxor mm7, mm7 243 244 mov esi, frameptr // accumulator 245 pxor mm2, mm2 // count 246 247 movd mm3, DWORD PTR [edi] 248 movd DWORD PTR [esi+4*eax], mm3 249 250 punpcklbw mm3, mm2 // mm3 source pixels 251 mov ecx, FRAMECOUNT 252 253 next_frame: 254 movd mm4, DWORD PTR [esi] // get frame buffer values 255 punpcklbw mm4, mm7 // mm4 frame buffer pixels 256 movq mm6, mm4 // save the pixel values 257 psubsw mm4, mm3 // subtracted pixel values 258 pmullw mm4, mm4 // square mm4 259 movd mm5, strength 260 psrlw mm4, mm5 // should be strength 261 pmullw mm4, threes // 3 * modifier 262 movq mm5, sixteens // 16s 263 psubusw mm5, mm4 // 16 - modifiers 264 movq mm4, mm5 // save the modifiers 265 pmullw mm4, mm6 // multiplier values 266 paddusw mm1, mm4 // accumulator 267 paddusw mm2, mm5 // count 268 add esi, 4 // next frame 269 dec ecx // next set of eight pixels 270 jnz next_frame 271 272 movq counts, mm2 273 psrlw mm2, 1 // divide count by 2 for rounding 274 paddusw mm1, mm2 // rounding added in 275 276 mov frameptr, esi 277 278 movq sums, mm1 279 280 } 281 282 for (i = 0; i < 4; i++) 283 { 284 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; 285 blurvalue >>= 16; 286 d[i] = blurvalue; 287 } 288 289 s += 4; 290 d += 4; 291 byte += 4; 292 } 293 while (byte < bytes); 294 } 295 296 ++ppi->frame; 297 __asm emms 298 } 299