1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12 /**************************************************************************** 13 * 14 * Module Title : scaleopt.cpp 15 * 16 * Description : Optimized scaling functions 17 * 18 ****************************************************************************/ 19 #include "pragmas.h" 20 21 /**************************************************************************** 22 * Module Statics 23 ****************************************************************************/ 24 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 }; 25 26 #include "vpx_scale/vpx_scale.h" 27 #include "vpx_mem/vpx_mem.h" 28 29 __declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 }; 30 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 }; 31 32 33 /**************************************************************************** 34 * 35 * ROUTINE : horizontal_line_5_4_scale_mmx 36 * 37 * INPUTS : const unsigned char *source : Pointer to source data. 38 * unsigned int source_width : Stride of source. 39 * unsigned char *dest : Pointer to destination data. 40 * unsigned int dest_width : Stride of destination (NOT USED). 41 * 42 * OUTPUTS : None. 43 * 44 * RETURNS : void 45 * 46 * FUNCTION : Copies horizontal line of pixels from source to 47 * destination scaling up by 4 to 5. 48 * 49 * SPECIAL NOTES : None. 50 * 51 ****************************************************************************/ 52 static 53 void horizontal_line_5_4_scale_mmx 54 ( 55 const unsigned char *source, 56 unsigned int source_width, 57 unsigned char *dest, 58 unsigned int dest_width 59 ) { 60 /* 61 unsigned i; 62 unsigned int a, b, c, d, e; 63 unsigned char *des = dest; 64 const unsigned char *src = source; 65 66 (void) dest_width; 67 68 for ( i=0; i<source_width; i+=5 ) 69 { 70 a = src[0]; 71 b = src[1]; 72 c = src[2]; 73 d = src[3]; 74 e = src[4]; 75 76 des[0] = a; 77 des[1] = ((b*192 + c* 64 + 128)>>8); 78 des[2] = ((c*128 + d*128 + 128)>>8); 79 des[3] = ((d* 64 + e*192 + 128)>>8); 80 81 src += 5; 82 des += 4; 83 } 84 */ 85 (void) dest_width; 86 87 __asm { 88 89 mov esi, source; 90 mov edi, dest; 91 92 mov ecx, source_width; 93 movq mm5, const54_1; 94 95 pxor mm7, mm7; 96 movq mm6, const54_2; 97 98 movq mm4, round_values; 99 lea edx, [esi+ecx]; 100 horizontal_line_5_4_loop: 101 102 movq mm0, QWORD PTR [esi]; 103 00 01 02 03 04 05 06 07 104 movq mm1, mm0; 105 00 01 02 03 04 05 06 07 106 107 psrlq mm0, 8; 108 01 02 03 04 05 06 07 xx 109 punpcklbw mm1, mm7; 110 xx 00 xx 01 xx 02 xx 03 111 112 punpcklbw mm0, mm7; 113 xx 01 xx 02 xx 03 xx 04 114 pmullw mm1, mm5 115 116 pmullw mm0, mm6 117 add esi, 5 118 119 add edi, 4 120 paddw mm1, mm0 121 122 paddw mm1, mm4 123 psrlw mm1, 8 124 125 cmp esi, edx 126 packuswb mm1, mm7 127 128 movd DWORD PTR [edi-4], mm1 129 130 jl horizontal_line_5_4_loop 131 132 } 133 134 } 135 __declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 }; 136 __declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 }; 137 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 }; 138 139 static 140 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) { 141 142 __asm { 143 push ebx 144 145 mov esi, source // Get the source and destination pointer 146 mov ecx, src_pitch // Get the pitch size 147 148 mov edi, dest // tow lines below 149 pxor mm7, mm7 // clear out mm7 150 151 mov edx, dest_pitch // Loop counter 152 mov ebx, dest_width 153 154 vs_5_4_loop: 155 156 movd mm0, DWORD ptr [esi] // src[0]; 157 movd mm1, DWORD ptr [esi+ecx] // src[1]; 158 159 movd mm2, DWORD ptr [esi+ecx*2] 160 lea eax, [esi+ecx*2] // 161 162 punpcklbw mm1, mm7 163 punpcklbw mm2, mm7 164 165 movq mm3, mm2 166 pmullw mm1, three_fourths 167 168 pmullw mm2, one_fourths 169 movd mm4, [eax+ecx] 170 171 pmullw mm3, two_fourths 172 punpcklbw mm4, mm7 173 174 movq mm5, mm4 175 pmullw mm4, two_fourths 176 177 paddw mm1, mm2 178 movd mm6, [eax+ecx*2] 179 180 pmullw mm5, one_fourths 181 paddw mm1, round_values; 182 183 paddw mm3, mm4 184 psrlw mm1, 8 185 186 punpcklbw mm6, mm7 187 paddw mm3, round_values 188 189 pmullw mm6, three_fourths 190 psrlw mm3, 8 191 192 packuswb mm1, mm7 193 packuswb mm3, mm7 194 195 movd DWORD PTR [edi], mm0 196 movd DWORD PTR [edi+edx], mm1 197 198 199 paddw mm5, mm6 200 movd DWORD PTR [edi+edx*2], mm3 201 202 lea eax, [edi+edx*2] 203 paddw mm5, round_values 204 205 psrlw mm5, 8 206 add edi, 4 207 208 packuswb mm5, mm7 209 movd DWORD PTR [eax+edx], mm5 210 211 add esi, 4 212 sub ebx, 4 213 214 jg vs_5_4_loop 215 216 pop ebx 217 } 218 } 219 220 221 __declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 }; 222 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 }; 223 224 225 static 226 void horizontal_line_5_3_scale_mmx 227 ( 228 const unsigned char *source, 229 unsigned int source_width, 230 unsigned char *dest, 231 unsigned int dest_width 232 ) { 233 234 (void) dest_width; 235 __asm { 236 237 mov esi, source; 238 mov edi, dest; 239 240 mov ecx, source_width; 241 movq mm5, const53_1; 242 243 pxor mm7, mm7; 244 movq mm6, const53_2; 245 246 movq mm4, round_values; 247 lea edx, [esi+ecx-5]; 248 horizontal_line_5_3_loop: 249 250 movq mm0, QWORD PTR [esi]; 251 00 01 02 03 04 05 06 07 252 movq mm1, mm0; 253 00 01 02 03 04 05 06 07 254 255 psllw mm0, 8; 256 xx 00 xx 02 xx 04 xx 06 257 psrlw mm1, 8; 258 01 xx 03 xx 05 xx 07 xx 259 260 psrlw mm0, 8; 261 00 xx 02 xx 04 xx 06 xx 262 psllq mm1, 16; 263 xx xx 01 xx 03 xx 05 xx 264 265 pmullw mm0, mm6 266 267 pmullw mm1, mm5 268 add esi, 5 269 270 add edi, 3 271 paddw mm1, mm0 272 273 paddw mm1, mm4 274 psrlw mm1, 8 275 276 cmp esi, edx 277 packuswb mm1, mm7 278 279 movd DWORD PTR [edi-3], mm1 280 jl horizontal_line_5_3_loop 281 282 // exit condition 283 movq mm0, QWORD PTR [esi]; 284 00 01 02 03 04 05 06 07 285 movq mm1, mm0; 286 00 01 02 03 04 05 06 07 287 288 psllw mm0, 8; 289 xx 00 xx 02 xx 04 xx 06 290 psrlw mm1, 8; 291 01 xx 03 xx 05 xx 07 xx 292 293 psrlw mm0, 8; 294 00 xx 02 xx 04 xx 06 xx 295 psllq mm1, 16; 296 xx xx 01 xx 03 xx 05 xx 297 298 pmullw mm0, mm6 299 300 pmullw mm1, mm5 301 paddw mm1, mm0 302 303 paddw mm1, mm4 304 psrlw mm1, 8 305 306 packuswb mm1, mm7 307 movd eax, mm1 308 309 mov edx, eax 310 shr edx, 16 311 312 mov WORD PTR[edi], ax 313 mov BYTE PTR[edi+2], dl 314 315 } 316 317 } 318 319 __declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 }; 320 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 }; 321 322 static 323 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) { 324 325 __asm { 326 push ebx 327 328 mov esi, source // Get the source and destination pointer 329 mov ecx, src_pitch // Get the pitch size 330 331 mov edi, dest // tow lines below 332 pxor mm7, mm7 // clear out mm7 333 334 mov edx, dest_pitch // Loop counter 335 movq mm5, one_thirds 336 337 movq mm6, two_thirds 338 mov ebx, dest_width; 339 340 vs_5_3_loop: 341 342 movd mm0, DWORD ptr [esi] // src[0]; 343 movd mm1, DWORD ptr [esi+ecx] // src[1]; 344 345 movd mm2, DWORD ptr [esi+ecx*2] 346 lea eax, [esi+ecx*2] // 347 348 punpcklbw mm1, mm7 349 punpcklbw mm2, mm7 350 351 pmullw mm1, mm5 352 pmullw mm2, mm6 353 354 movd mm3, DWORD ptr [eax+ecx] 355 movd mm4, DWORD ptr [eax+ecx*2] 356 357 punpcklbw mm3, mm7 358 punpcklbw mm4, mm7 359 360 pmullw mm3, mm6 361 pmullw mm4, mm5 362 363 364 movd DWORD PTR [edi], mm0 365 paddw mm1, mm2 366 367 paddw mm1, round_values 368 psrlw mm1, 8 369 370 packuswb mm1, mm7 371 paddw mm3, mm4 372 373 paddw mm3, round_values 374 movd DWORD PTR [edi+edx], mm1 375 376 psrlw mm3, 8 377 packuswb mm3, mm7 378 379 movd DWORD PTR [edi+edx*2], mm3 380 381 382 add edi, 4 383 add esi, 4 384 385 sub ebx, 4 386 jg vs_5_3_loop 387 388 pop ebx 389 } 390 } 391 392 393 394 395 /**************************************************************************** 396 * 397 * ROUTINE : horizontal_line_2_1_scale 398 * 399 * INPUTS : const unsigned char *source : 400 * unsigned int source_width : 401 * unsigned char *dest : 402 * unsigned int dest_width : 403 * 404 * OUTPUTS : None. 405 * 406 * RETURNS : void 407 * 408 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels. 409 * 410 * SPECIAL NOTES : None. 411 * 412 ****************************************************************************/ 413 static 414 void horizontal_line_2_1_scale_mmx 415 ( 416 const unsigned char *source, 417 unsigned int source_width, 418 unsigned char *dest, 419 unsigned int dest_width 420 ) { 421 (void) dest_width; 422 (void) source_width; 423 __asm { 424 mov esi, source 425 mov edi, dest 426 427 pxor mm7, mm7 428 mov ecx, dest_width 429 430 xor edx, edx 431 hs_2_1_loop: 432 433 movq mm0, [esi+edx*2] 434 psllw mm0, 8 435 436 psrlw mm0, 8 437 packuswb mm0, mm7 438 439 movd DWORD Ptr [edi+edx], mm0; 440 add edx, 4 441 442 cmp edx, ecx 443 jl hs_2_1_loop 444 445 } 446 } 447 448 449 450 static 451 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) { 452 (void) dest_pitch; 453 (void) src_pitch; 454 vpx_memcpy(dest, source, dest_width); 455 } 456 457 458 __declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 }; 459 __declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 }; 460 461 static 462 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) { 463 464 (void) dest_pitch; 465 __asm { 466 mov esi, source 467 mov edi, dest 468 469 mov eax, src_pitch 470 mov edx, dest_width 471 472 pxor mm7, mm7 473 sub esi, eax // back one line 474 475 476 lea ecx, [esi+edx]; 477 movq mm6, round_values; 478 479 movq mm5, three_sixteenths; 480 movq mm4, ten_sixteenths; 481 482 vs_2_1_i_loop: 483 movd mm0, [esi] // 484 movd mm1, [esi+eax] // 485 486 movd mm2, [esi+eax*2] // 487 punpcklbw mm0, mm7 488 489 pmullw mm0, mm5 490 punpcklbw mm1, mm7 491 492 pmullw mm1, mm4 493 punpcklbw mm2, mm7 494 495 pmullw mm2, mm5 496 paddw mm0, round_values 497 498 paddw mm1, mm2 499 paddw mm0, mm1 500 501 psrlw mm0, 8 502 packuswb mm0, mm7 503 504 movd DWORD PTR [edi], mm0 505 add esi, 4 506 507 add edi, 4; 508 cmp esi, ecx 509 jl vs_2_1_i_loop 510 511 } 512 } 513 514 515 516 void 517 register_mmxscalers(void) { 518 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx; 519 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx; 520 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx; 521 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx; 522 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx; 523 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx; 524 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx; 525 } 526