Home | History | Annotate | Download | only in win32
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 /****************************************************************************
     13 *
     14 *   Module Title :     scaleopt.cpp
     15 *
     16 *   Description  :     Optimized scaling functions
     17 *
     18 ****************************************************************************/
     19 #include "pragmas.h"
     20 
     21 
     22 
     23 /****************************************************************************
     24 *  Module Statics
     25 ****************************************************************************/
     26 __declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
     27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
     28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
     29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
     30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
     31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
     32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
     33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
     34 __declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
     35 __declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
     36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
     37 
     38 
     39 
     40 #include "vpx_scale/vpxscale.h"
     41 #include "vpx_mem/vpx_mem.h"
     42 
     43 /****************************************************************************
     44  *
     45  *  ROUTINE       : horizontal_line_3_5_scale_mmx
     46  *
     47  *  INPUTS        : const unsigned char *source :
     48  *                  unsigned int source_width    :
     49  *                  unsigned char *dest         :
     50  *                  unsigned int dest_width      :
     51  *
     52  *  OUTPUTS       : None.
     53  *
     54  *  RETURNS       : void
     55  *
     56  *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
     57  *
     58  *  SPECIAL NOTES : None.
     59  *
     60  ****************************************************************************/
     61 static
     62 void horizontal_line_3_5_scale_mmx
     63 (
     64     const unsigned char *source,
     65     unsigned int source_width,
     66     unsigned char *dest,
     67     unsigned int dest_width
     68 )
     69 {
     70     (void) dest_width;
     71 
     72     __asm
     73     {
     74 
     75         push ebx
     76 
     77         mov         esi,    source
     78         mov         edi,    dest
     79 
     80         mov         ecx,    source_width
     81         lea         edx,    [esi+ecx-3];
     82 
     83         movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
     84         movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
     85 
     86         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
     87         pxor        mm7,    mm7             // clear mm7
     88 
     89         horiz_line_3_5_loop:
     90 
     91         mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
     92         mov        ebx,    eax
     93 
     94         and         ebx,    0xffff00        // ebx = xx 01 02 xx
     95         mov         ecx,    eax             // ecx = 00 01 02 03
     96 
     97         and         eax,    0xffff0000      // eax = xx xx 02 03
     98         xor         ecx,    eax             // ecx = 00 01 xx xx
     99 
    100         shr         ebx,    8               // ebx = 01 02 xx xx
    101         or          eax,    ebx             // eax = 01 02 02 03
    102 
    103         shl         ebx,    16              // ebx = xx xx 01 02
    104         movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
    105 
    106         or          ebx,    ecx             // ebx = 00 01 01 02
    107         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
    108 
    109         movd        mm0,    ebx             // mm0 = 00 01 01 02
    110         pmullw      mm1,    mm6             //
    111 
    112         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    113         pmullw      mm0,    mm5             //
    114 
    115         mov         [edi],  ebx             // writeoutput 00 xx xx xx
    116         add         esi,    3
    117 
    118         add         edi,    5
    119         paddw       mm0,    mm1
    120 
    121         paddw       mm0,    mm4
    122         psrlw       mm0,    8
    123 
    124         cmp         esi,    edx
    125         packuswb    mm0,    mm7
    126 
    127         movd        DWORD Ptr [edi-4], mm0
    128         jl          horiz_line_3_5_loop
    129 
    130 //Exit:
    131         mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
    132         mov         ebx,    eax
    133 
    134         and         ebx,    0xffff00        // ebx = xx 01 02 xx
    135         mov         ecx,    eax             // ecx = 00 01 02 03
    136 
    137         and         eax,    0xffff0000      // eax = xx xx 02 03
    138         xor         ecx,    eax             // ecx = 00 01 xx xx
    139 
    140         shr         ebx,    8               // ebx = 01 02 xx xx
    141         or          eax,    ebx             // eax = 01 02 02 03
    142 
    143         shl         eax,    8               // eax = xx 01 02 02
    144         and         eax,    0xffff0000      // eax = xx xx 02 02
    145 
    146         or          eax,    ebx             // eax = 01 02 02 02
    147 
    148         shl         ebx,    16              // ebx = xx xx 01 02
    149         movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
    150 
    151         or          ebx,    ecx             // ebx = 00 01 01 02
    152         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
    153 
    154         movd        mm0,    ebx             // mm0 = 00 01 01 02
    155         pmullw      mm1,    mm6             //
    156 
    157         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    158         pmullw      mm0,    mm5             //
    159 
    160         mov         [edi],  ebx             // writeoutput 00 xx xx xx
    161         paddw       mm0,    mm1
    162 
    163         paddw       mm0,    mm4
    164         psrlw       mm0,    8
    165 
    166         packuswb    mm0,    mm7
    167         movd        DWORD Ptr [edi+1], mm0
    168 
    169         pop ebx
    170 
    171     }
    172 
    173 }
    174 
    175 
    176 /****************************************************************************
    177  *
    178  *  ROUTINE       : horizontal_line_4_5_scale_mmx
    179  *
    180  *  INPUTS        : const unsigned char *source :
    181  *                  unsigned int source_width    :
    182  *                  unsigned char *dest         :
    183  *                  unsigned int dest_width      :
    184  *
    185  *  OUTPUTS       : None.
    186  *
    187  *  RETURNS       : void
    188  *
    189  *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
    190  *
    191  *  SPECIAL NOTES : None.
    192  *
    193  ****************************************************************************/
    194 static
    195 void horizontal_line_4_5_scale_mmx
    196 (
    197     const unsigned char *source,
    198     unsigned int source_width,
    199     unsigned char *dest,
    200     unsigned int dest_width
    201 )
    202 {
    203     (void)dest_width;
    204 
    205     __asm
    206     {
    207 
    208         mov         esi,    source
    209         mov         edi,    dest
    210 
    211         mov         ecx,    source_width
    212         lea         edx,    [esi+ecx-8];
    213 
    214         movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
    215         movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
    216 
    217         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
    218         pxor        mm7,    mm7             // clear mm7
    219 
    220         horiz_line_4_5_loop:
    221 
    222         movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
    223         movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
    224 
    225         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    226         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
    227 
    228         movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
    229         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
    230 
    231         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    232         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
    233 
    234         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    235         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
    236 
    237         movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
    238         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
    239 
    240         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
    241         pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
    242 
    243         paddw       mm0,    mm1             // added round values
    244         paddw       mm0,    mm4
    245 
    246         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
    247         packuswb    mm0,    mm7
    248 
    249         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
    250         add         edi,    10
    251 
    252         add         esi,    8
    253         paddw       mm2,    mm3             //
    254 
    255         paddw       mm2,    mm4             // added round values
    256         cmp         esi,    edx
    257 
    258         psrlw       mm2,    8
    259         packuswb    mm2,    mm7
    260 
    261         movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
    262         jl         horiz_line_4_5_loop
    263 
    264 //Exit:
    265         movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
    266         movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
    267 
    268         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    269         psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
    270 
    271         movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
    272         pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
    273 
    274         psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
    275         por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
    276 
    277         movq        mm3,    mm1
    278 
    279         movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
    280         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
    281 
    282         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    283         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
    284 
    285         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    286         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
    287 
    288         movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
    289         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
    290 
    291         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
    292         pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
    293 
    294         paddw       mm0,    mm1             // added round values
    295         paddw       mm0,    mm4
    296 
    297         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
    298         packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
    299 
    300         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
    301         paddw       mm2,    mm3             //
    302 
    303         paddw       mm2,    mm4             // added round values
    304         psrlw       mm2,    8
    305 
    306         packuswb    mm2,    mm7
    307         movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09
    308 
    309 
    310     }
    311 }
    312 
    313 /****************************************************************************
    314  *
    315  *  ROUTINE       : vertical_band_4_5_scale_mmx
    316  *
    317  *  INPUTS        : unsigned char *dest    :
    318  *                  unsigned int dest_pitch :
    319  *                  unsigned int dest_width :
    320  *
    321  *  OUTPUTS       : None.
    322  *
    323  *  RETURNS       : void
    324  *
    325  *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
    326  *
    327  *  SPECIAL NOTES : The routine uses the first line of the band below
    328  *                  the current band. The function also has a "C" only
    329  *                  version.
    330  *
    331  ****************************************************************************/
    332 static
    333 void vertical_band_4_5_scale_mmx
    334 (
    335     unsigned char *dest,
    336     unsigned int dest_pitch,
    337     unsigned int dest_width
    338 )
    339 {
    340     __asm
    341     {
    342 
    343         mov         esi,    dest                    // Get the source and destination pointer
    344         mov         ecx,    dest_pitch               // Get the pitch size
    345 
    346         lea         edi,    [esi+ecx*2]             // tow lines below
    347         add         edi,    ecx                     // three lines below
    348 
    349         pxor        mm7,    mm7                     // clear out mm7
    350         mov         edx,    dest_width               // Loop counter
    351 
    352         vs_4_5_loop:
    353 
    354         movq        mm0,    QWORD ptr [esi]         // src[0];
    355         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    356 
    357         movq        mm2,    mm0                     // Make a copy
    358         punpcklbw   mm0,    mm7                     // unpack low to word
    359 
    360         movq        mm5,    one_fifth
    361         punpckhbw   mm2,    mm7                     // unpack high to word
    362 
    363         pmullw      mm0,    mm5                     // a * 1/5
    364 
    365         movq        mm3,    mm1                     // make a copy
    366         punpcklbw   mm1,    mm7                     // unpack low to word
    367 
    368         pmullw      mm2,    mm5                     // a * 1/5
    369         movq        mm6,    four_fifths               // constan
    370 
    371         movq        mm4,    mm1                     // copy of low b
    372         pmullw      mm4,    mm6                     // b * 4/5
    373 
    374         punpckhbw   mm3,    mm7                     // unpack high to word
    375         movq        mm5,    mm3                     // copy of high b
    376 
    377         pmullw      mm5,    mm6                     // b * 4/5
    378         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
    379 
    380         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    381         paddw       mm0,    round_values             // + 128
    382 
    383         paddw       mm2,    round_values             // + 128
    384         psrlw       mm0,    8
    385 
    386         psrlw       mm2,    8
    387         packuswb    mm0,    mm2                     // des [1]
    388 
    389         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    390         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    391 
    392         // mm1, mm3 --- Src[1]
    393         // mm0 --- Src[2]
    394         // mm7 for unpacking
    395 
    396         movq        mm5,    two_fifths
    397         movq        mm2,    mm0                     // make a copy
    398 
    399         pmullw      mm1,    mm5                     // b * 2/5
    400         movq        mm6,    three_fifths
    401 
    402 
    403         punpcklbw   mm0,    mm7                     // unpack low to word
    404         pmullw      mm3,    mm5                     // b * 2/5
    405 
    406         movq        mm4,    mm0                     // make copy of c
    407         punpckhbw   mm2,    mm7                     // unpack high to word
    408 
    409         pmullw      mm4,    mm6                     // c * 3/5
    410         movq        mm5,    mm2
    411 
    412         pmullw      mm5,    mm6                     // c * 3/5
    413         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
    414 
    415         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    416         paddw       mm1,    round_values             // + 128
    417 
    418         paddw       mm3,    round_values             // + 128
    419         psrlw       mm1,    8
    420 
    421         psrlw       mm3,    8
    422         packuswb    mm1,    mm3                     // des[2]
    423 
    424         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    425         movq        mm1,    [edi]                   // mm1=Src[3];
    426 
    427         // mm0, mm2 --- Src[2]
    428         // mm1 --- Src[3]
    429         // mm6 --- 3/5
    430         // mm7 for unpacking
    431 
    432         pmullw      mm0,    mm6                     // c * 3/5
    433         movq        mm5,    two_fifths               // mm5 = 2/5
    434 
    435         movq        mm3,    mm1                     // make a copy
    436         pmullw      mm2,    mm6                     // c * 3/5
    437 
    438         punpcklbw   mm1,    mm7                     // unpack low
    439         movq        mm4,    mm1                     // make a copy
    440 
    441         punpckhbw   mm3,    mm7                     // unpack high
    442         pmullw      mm4,    mm5                     // d * 2/5
    443 
    444         movq        mm6,    mm3                     // make a copy
    445         pmullw      mm6,    mm5                     // d * 2/5
    446 
    447         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    448         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
    449 
    450         paddw       mm0,    round_values             // + 128
    451         paddw       mm2,    round_values             // + 128
    452 
    453         psrlw       mm0,    8
    454         psrlw       mm2,    8
    455 
    456         packuswb    mm0,    mm2                     // des[3]
    457         movq        QWORD ptr [edi], mm0            // write des[3]
    458 
    459         //  mm1, mm3 --- Src[3]
    460         //  mm7 -- cleared for unpacking
    461 
    462         movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group
    463 
    464         movq        mm5,    four_fifths              // mm5 = 4/5
    465         pmullw      mm1,    mm5                     // d * 4/5
    466 
    467         movq        mm6,    one_fifth                // mm6 = 1/5
    468         movq        mm2,    mm0                     // make a copy
    469 
    470         pmullw      mm3,    mm5                     // d * 4/5
    471         punpcklbw   mm0,    mm7                     // unpack low
    472 
    473         pmullw      mm0,    mm6                     // an * 1/5
    474         punpckhbw   mm2,    mm7                     // unpack high
    475 
    476         paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
    477         pmullw      mm2,    mm6                     // an * 1/5
    478 
    479         paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
    480         paddw       mm1,    round_values             // + 128
    481 
    482         paddw       mm3,    round_values             // + 128
    483         psrlw       mm1,    8
    484 
    485         psrlw       mm3,    8
    486         packuswb    mm1,    mm3                     // des[4]
    487 
    488         movq        QWORD ptr [edi+ecx], mm1        // write des[4]
    489 
    490         add         edi,    8
    491         add         esi,    8
    492 
    493         sub         edx,    8
    494         jg         vs_4_5_loop
    495     }
    496 }
    497 
    498 /****************************************************************************
    499  *
    500  *  ROUTINE       : last_vertical_band_4_5_scale_mmx
    501  *
    502  *  INPUTS        : unsigned char *dest    :
    503  *                  unsigned int dest_pitch :
    504  *                  unsigned int dest_width :
    505  *
    506  *  OUTPUTS       : None.
    507  *
    508  *  RETURNS       : None
    509  *
    510  *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
    511  *
    512  *  SPECIAL NOTES : The routine uses the first line of the band below
    513  *                  the current band. The function also has an "C" only
    514  *                  version.
    515  *
    516  ****************************************************************************/
    517 static
    518 void last_vertical_band_4_5_scale_mmx
    519 (
    520     unsigned char *dest,
    521     unsigned int dest_pitch,
    522     unsigned int dest_width
    523 )
    524 {
    525     __asm
    526     {
    527         mov         esi,    dest                    // Get the source and destination pointer
    528         mov         ecx,    dest_pitch               // Get the pitch size
    529 
    530         lea         edi,    [esi+ecx*2]             // tow lines below
    531         add         edi,    ecx                     // three lines below
    532 
    533         pxor        mm7,    mm7                     // clear out mm7
    534         mov         edx,    dest_width               // Loop counter
    535 
    536         last_vs_4_5_loop:
    537 
    538         movq        mm0,    QWORD ptr [esi]         // src[0];
    539         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    540 
    541         movq        mm2,    mm0                     // Make a copy
    542         punpcklbw   mm0,    mm7                     // unpack low to word
    543 
    544         movq        mm5,    one_fifth
    545         punpckhbw   mm2,    mm7                     // unpack high to word
    546 
    547         pmullw      mm0,    mm5                     // a * 1/5
    548 
    549         movq        mm3,    mm1                     // make a copy
    550         punpcklbw   mm1,    mm7                     // unpack low to word
    551 
    552         pmullw      mm2,    mm5                     // a * 1/5
    553         movq        mm6,    four_fifths               // constan
    554 
    555         movq        mm4,    mm1                     // copy of low b
    556         pmullw      mm4,    mm6                     // b * 4/5
    557 
    558         punpckhbw   mm3,    mm7                     // unpack high to word
    559         movq        mm5,    mm3                     // copy of high b
    560 
    561         pmullw      mm5,    mm6                     // b * 4/5
    562         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
    563 
    564         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    565         paddw       mm0,    round_values             // + 128
    566 
    567         paddw       mm2,    round_values             // + 128
    568         psrlw       mm0,    8
    569 
    570         psrlw       mm2,    8
    571         packuswb    mm0,    mm2                     // des [1]
    572 
    573         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    574         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    575 
    576         // mm1, mm3 --- Src[1]
    577         // mm0 --- Src[2]
    578         // mm7 for unpacking
    579 
    580         movq        mm5,    two_fifths
    581         movq        mm2,    mm0                     // make a copy
    582 
    583         pmullw      mm1,    mm5                     // b * 2/5
    584         movq        mm6,    three_fifths
    585 
    586 
    587         punpcklbw   mm0,    mm7                     // unpack low to word
    588         pmullw      mm3,    mm5                     // b * 2/5
    589 
    590         movq        mm4,    mm0                     // make copy of c
    591         punpckhbw   mm2,    mm7                     // unpack high to word
    592 
    593         pmullw      mm4,    mm6                     // c * 3/5
    594         movq        mm5,    mm2
    595 
    596         pmullw      mm5,    mm6                     // c * 3/5
    597         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
    598 
    599         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    600         paddw       mm1,    round_values             // + 128
    601 
    602         paddw       mm3,    round_values             // + 128
    603         psrlw       mm1,    8
    604 
    605         psrlw       mm3,    8
    606         packuswb    mm1,    mm3                     // des[2]
    607 
    608         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    609         movq        mm1,    [edi]                   // mm1=Src[3];
    610 
    611         movq        QWORD ptr [edi+ecx], mm1        // write des[4];
    612 
    613         // mm0, mm2 --- Src[2]
    614         // mm1 --- Src[3]
    615         // mm6 --- 3/5
    616         // mm7 for unpacking
    617 
    618         pmullw      mm0,    mm6                     // c * 3/5
    619         movq        mm5,    two_fifths               // mm5 = 2/5
    620 
    621         movq        mm3,    mm1                     // make a copy
    622         pmullw      mm2,    mm6                     // c * 3/5
    623 
    624         punpcklbw   mm1,    mm7                     // unpack low
    625         movq        mm4,    mm1                     // make a copy
    626 
    627         punpckhbw   mm3,    mm7                     // unpack high
    628         pmullw      mm4,    mm5                     // d * 2/5
    629 
    630         movq        mm6,    mm3                     // make a copy
    631         pmullw      mm6,    mm5                     // d * 2/5
    632 
    633         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    634         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
    635 
    636         paddw       mm0,    round_values             // + 128
    637         paddw       mm2,    round_values             // + 128
    638 
    639         psrlw       mm0,    8
    640         psrlw       mm2,    8
    641 
    642         packuswb    mm0,    mm2                     // des[3]
    643         movq        QWORD ptr [edi], mm0            // write des[3]
    644 
    645         //  mm1, mm3 --- Src[3]
    646         //  mm7 -- cleared for unpacking
    647         add         edi,    8
    648         add         esi,    8
    649 
    650         sub         edx,    8
    651         jg          last_vs_4_5_loop
    652     }
    653 }
    654 
    655 /****************************************************************************
    656  *
    657  *  ROUTINE       : vertical_band_3_5_scale_mmx
    658  *
    659  *  INPUTS        : unsigned char *dest    :
    660  *                  unsigned int dest_pitch :
    661  *                  unsigned int dest_width :
    662  *
    663  *  OUTPUTS       : None.
    664  *
    665  *  RETURNS       : void
    666  *
    667  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
    668  *
    669  *  SPECIAL NOTES : The routine uses the first line of the band below
    670  *                  the current band. The function also has a "C"-only
    671  *                  version.
    672  *
    673  ****************************************************************************/
    674 static
    675 void vertical_band_3_5_scale_mmx
    676 (
    677     unsigned char *dest,
    678     unsigned int dest_pitch,
    679     unsigned int dest_width
    680 )
    681 {
    682     __asm
    683     {
    684         mov         esi,    dest                    // Get the source and destination pointer
    685         mov         ecx,    dest_pitch               // Get the pitch size
    686 
    687         lea         edi,    [esi+ecx*2]             // tow lines below
    688         add         edi,    ecx                     // three lines below
    689 
    690         pxor        mm7,    mm7                     // clear out mm7
    691         mov         edx,    dest_width               // Loop counter
    692 
    693         vs_3_5_loop:
    694 
    695         movq        mm0,    QWORD ptr [esi]         // src[0];
    696         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    697 
    698         movq        mm2,    mm0                     // Make a copy
    699         punpcklbw   mm0,    mm7                     // unpack low to word
    700 
    701         movq        mm5,    two_fifths               // mm5 = 2/5
    702         punpckhbw   mm2,    mm7                     // unpack high to word
    703 
    704         pmullw      mm0,    mm5                     // a * 2/5
    705 
    706         movq        mm3,    mm1                     // make a copy
    707         punpcklbw   mm1,    mm7                     // unpack low to word
    708 
    709         pmullw      mm2,    mm5                     // a * 2/5
    710         movq        mm6,    three_fifths             // mm6 = 3/5
    711 
    712         movq        mm4,    mm1                     // copy of low b
    713         pmullw      mm4,    mm6                     // b * 3/5
    714 
    715         punpckhbw   mm3,    mm7                     // unpack high to word
    716         movq        mm5,    mm3                     // copy of high b
    717 
    718         pmullw      mm5,    mm6                     // b * 3/5
    719         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
    720 
    721         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    722         paddw       mm0,    round_values             // + 128
    723 
    724         paddw       mm2,    round_values             // + 128
    725         psrlw       mm0,    8
    726 
    727         psrlw       mm2,    8
    728         packuswb    mm0,    mm2                     // des [1]
    729 
    730         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    731         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    732 
    733         // mm1, mm3 --- Src[1]
    734         // mm0 --- Src[2]
    735         // mm7 for unpacking
    736 
    737         movq        mm4,    mm1                     // b low
    738         pmullw      mm1,    four_fifths              // b * 4/5 low
    739 
    740         movq        mm5,    mm3                     // b high
    741         pmullw      mm3,    four_fifths              // b * 4/5 high
    742 
    743         movq        mm2,    mm0                     // c
    744         pmullw      mm4,    one_fifth                // b * 1/5
    745 
    746         punpcklbw   mm0,    mm7                     // c low
    747         pmullw      mm5,    one_fifth                // b * 1/5
    748 
    749         movq        mm6,    mm0                     // make copy of c low
    750         punpckhbw   mm2,    mm7                     // c high
    751 
    752         pmullw      mm6,    one_fifth                // c * 1/5 low
    753         movq        mm7,    mm2                     // make copy of c high
    754 
    755         pmullw      mm7,    one_fifth                // c * 1/5 high
    756         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
    757 
    758         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    759         movq        mm6,    mm0                     // make copy of c low
    760 
    761         pmullw      mm6,    four_fifths              // c * 4/5 low
    762         movq        mm7,    mm2                     // make copy of c high
    763 
    764         pmullw      mm7,    four_fifths              // c * 4/5 high
    765 
    766         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    767         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
    768 
    769         paddw       mm1,    round_values             // + 128
    770         paddw       mm3,    round_values             // + 128
    771 
    772         psrlw       mm1,    8
    773         psrlw       mm3,    8
    774 
    775         packuswb    mm1,    mm3                     // des[2]
    776         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    777 
    778         paddw       mm4,    round_values             // + 128
    779         paddw       mm5,    round_values             // + 128
    780 
    781         psrlw       mm4,    8
    782         psrlw       mm5,    8
    783 
    784         packuswb    mm4,    mm5                     // des[3]
    785         movq        QWORD ptr [edi], mm4            // write des[3]
    786 
    787         //  mm0, mm2 --- Src[3]
    788 
    789         pxor        mm7,    mm7                     // clear mm7 for unpacking
    790         movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group
    791 
    792         movq        mm5,    three_fifths             // mm5 = 3/5
    793         pmullw      mm0,    mm5                     // d * 3/5
    794 
    795         movq        mm6,    two_fifths                // mm6 = 2/5
    796         movq        mm3,    mm1                     // make a copy
    797 
    798         pmullw      mm2,    mm5                     // d * 3/5
    799         punpcklbw   mm1,    mm7                     // unpack low
    800 
    801         pmullw      mm1,    mm6                     // an * 2/5
    802         punpckhbw   mm3,    mm7                     // unpack high
    803 
    804         paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
    805         pmullw      mm3,    mm6                     // an * 2/5
    806 
    807         paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
    808         paddw       mm0,    round_values             // + 128
    809 
    810         paddw       mm2,    round_values             // + 128
    811         psrlw       mm0,    8
    812 
    813         psrlw       mm2,    8
    814         packuswb    mm0,    mm2                     // des[4]
    815 
    816         movq        QWORD ptr [edi+ecx], mm0        // write des[4]
    817 
    818         add         edi,    8
    819         add         esi,    8
    820 
    821         sub         edx,    8
    822         jg          vs_3_5_loop
    823     }
    824 }
    825 
    826 /****************************************************************************
    827  *
    828  *  ROUTINE       : last_vertical_band_3_5_scale_mmx
    829  *
    830  *  INPUTS        : unsigned char *dest    : Row 0 of the band; the band is scaled in place.
    831  *                  unsigned int dest_pitch : Byte offset from one row to the next.
    832  *                  unsigned int dest_width : Bytes per row to process (8 bytes per loop pass).
    833  *
    834  *  OUTPUTS       : Rows 1..4 below dest are overwritten; row 0 is only read.
    835  *
    836  *  RETURNS       : void
    837  *
    838  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
    839  *
    840  *  SPECIAL NOTES : Only rows 0..2 of the band are read; no row of a following band
    841  *                  is used, and output row 4 is a plain copy of input row 2.
    842  *                  The function also has an "C" only version.
    843  *
    844  ****************************************************************************/
    845 static
    846 void last_vertical_band_3_5_scale_mmx
    847 (
    848     unsigned char *dest,
    849     unsigned int dest_pitch,
    850     unsigned int dest_width
    851 )
    852 {
    853     __asm
    854     {
    855         mov         esi,    dest                    // esi = row 0 (input a; never written)
    856         mov         ecx,    dest_pitch               // ecx = row pitch in bytes
    857 
    858         lea         edi,    [esi+ecx*2]             // two lines below
    859         add         edi,    ecx                     // three lines below (edi = row 3)
    860 
    861         pxor        mm7,    mm7                     // mm7 = 0, the zero half for byte -> word unpacks
    862         mov         edx,    dest_width               // Loop counter: bytes of the row still to do
    863 
    864         // Each pass handles one 8-byte column group of rows 0..4.
    865         last_vs_3_5_loop:
    866 
    867         movq        mm0,    QWORD ptr [esi]         // src[0];
    868         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    869 
    870         movq        mm2,    mm0                     // Make a copy
    871         punpcklbw   mm0,    mm7                     // unpack low to word
    872 
    873         movq        mm5,    two_fifths               // mm5 = 2/5
    874         punpckhbw   mm2,    mm7                     // unpack high to word
    875 
    876         pmullw      mm0,    mm5                     // a * 2/5
    877 
    878         movq        mm3,    mm1                     // make a copy
    879         punpcklbw   mm1,    mm7                     // unpack low to word
    880 
    881         pmullw      mm2,    mm5                     // a * 2/5
    882         movq        mm6,    three_fifths             // mm6 = 3/5
    883 
    884         movq        mm4,    mm1                     // copy of low b
    885         pmullw      mm4,    mm6                     // b * 3/5
    886 
    887         punpckhbw   mm3,    mm7                     // unpack high to word
    888         movq        mm5,    mm3                     // copy of high b
    889 
    890         pmullw      mm5,    mm6                     // b * 3/5
    891         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
    892 
    893         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    894         paddw       mm0,    round_values             // + 128
    895 
    896         paddw       mm2,    round_values             // + 128
    897         psrlw       mm0,    8
    898 
    899         psrlw       mm2,    8
    900         packuswb    mm0,    mm2                     // des [1]
    901 
    902         movq        QWORD ptr [esi+ecx], mm0        // write des[1] (unpacked b is still held in mm1/mm3)
    903         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    904 
    905 
    906 
    907         // mm1, mm3 --- Src[1]
    908         // mm0 --- Src[2]
    909         // mm7 for unpacking
    910 
    911         movq        mm4,    mm1                     // b low
    912         pmullw      mm1,    four_fifths              // b * 4/5 low
    913 
    914         movq        QWORD ptr [edi+ecx], mm0        // write des[4] = unmodified src[2]
    915 
    916         movq        mm5,    mm3                     // b high
    917         pmullw      mm3,    four_fifths              // b * 4/5 high
    918 
    919         movq        mm2,    mm0                     // c
    920         pmullw      mm4,    one_fifth                // b * 1/5
    921 
    922         punpcklbw   mm0,    mm7                     // c low
    923         pmullw      mm5,    one_fifth                // b * 1/5
    924 
    925         movq        mm6,    mm0                     // make copy of c low
    926         punpckhbw   mm2,    mm7                     // c high (last use of mm7 as zero in this pass)
    927 
    928         pmullw      mm6,    one_fifth                // c * 1/5 low
    929         movq        mm7,    mm2                     // make copy of c high (mm7 is scratch from here on)
    930 
    931         pmullw      mm7,    one_fifth                // c * 1/5 high
    932         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
    933 
    934         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    935         movq        mm6,    mm0                     // make copy of c low
    936 
    937         pmullw      mm6,    four_fifths              // c * 4/5 low
    938         movq        mm7,    mm2                     // make copy of c high
    939 
    940         pmullw      mm7,    four_fifths              // c * 4/5 high
    941 
    942         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    943         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
    944 
    945         paddw       mm1,    round_values             // + 128
    946         paddw       mm3,    round_values             // + 128
    947 
    948         psrlw       mm1,    8
    949         psrlw       mm3,    8
    950 
    951         packuswb    mm1,    mm3                     // des[2]
    952         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    953 
    954         paddw       mm4,    round_values             // + 128
    955         paddw       mm5,    round_values             // + 128
    956 
    957         psrlw       mm4,    8
    958         psrlw       mm5,    8
    959 
    960         packuswb    mm4,    mm5                     // des[3]
    961         movq        QWORD ptr [edi], mm4            // write des[3]
    962 
    963         // mm7 still holds c * 4/5 (high half) here; the loop top unpacks against it,
    964         pxor        mm7,    mm7                     // so it must be zero again before the next pass
    965         add         edi,    8
    966         add         esi,    8
    967 
    968         sub         edx,    8
    969         jg          last_vs_3_5_loop
    970     }
    971 }
    972 
    973 /****************************************************************************
    974  *
    975  *  ROUTINE       : vertical_band_1_2_scale_mmx
    976  *
    977  *  INPUTS        : unsigned char *dest    : Upper of the two rows that get averaged.
    978  *                  unsigned int dest_pitch : Byte offset from one row to the next.
    979  *                  unsigned int dest_width : Bytes per row to process (8 bytes per loop pass).
    980  *
    981  *  OUTPUTS       : The row at dest + dest_pitch is overwritten.
    982  *
    983  *  RETURNS       : void
    984  *
    985  *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
    986  *
    987  *  SPECIAL NOTES : The row at dest + 2 * dest_pitch (the first line of the band below
    988  *                  the current band) is read and averaged, with rounding, against the
    989  *                  row at dest. The function also has an "C" only version.
    990  *
    991  ****************************************************************************/
    992 static
    993 void vertical_band_1_2_scale_mmx
    994 (
    995     unsigned char *dest,
    996     unsigned int dest_pitch,
    997     unsigned int dest_width
    998 )
    999 {
   1000     __asm
   1001     {
   1002 
   1003         mov         esi,    dest                    // esi = upper input row; advances 8 bytes per pass
   1004         mov         ecx,    dest_pitch               // ecx = row pitch in bytes
   1005 
   1006         pxor        mm7,    mm7                     // mm7 = 0 for byte -> word unpacking (never modified below)
   1007         mov         edx,    dest_width               // Loop counter: bytes of the row still to do
   1008 
   1009         vs_1_2_loop:
   1010 
   1011         movq        mm0,    [esi]                   // get Src[0] (a): 8 bytes of the upper row
   1012         movq        mm1,    [esi + ecx * 2]         // get Src[1] (b): same columns, two pitches down
   1013 
   1014         movq        mm2,    mm0                     // make copy before unpack
   1015         movq        mm3,    mm1                     // make copy before unpack
   1016 
   1017         punpcklbw   mm0,    mm7                     // low Src[0]
   1018         movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
   1019 
   1020         punpcklbw   mm1,    mm7                     // low Src[1]
   1021         paddw       mm0,    mm1                     // low (a + b)
   1022 
   1023         punpckhbw   mm2,    mm7                     // high Src[0]
   1024         paddw       mm0,    mm6                     // low (a + b + 1)
   1025 
   1026         punpckhbw   mm3,    mm7                     // high Src[1]
   1027         paddw       mm2,    mm3                     // high (a + b )
   1028 
   1029         psraw       mm0,    1                       // low (a + b +1 )/2
   1030         paddw       mm2,    mm6                     // high (a + b + 1)
   1031 
   1032         psraw       mm2,    1                       // high (a + b + 1)/2
   1033         packuswb    mm0,    mm2                     // pack results
   1034 
   1035         movq        [esi+ecx], mm0                  // write out eight bytes to the row in between
   1036         add         esi,    8
   1037 
   1038         sub         edx,    8                       // flags from this sub drive the jg below
   1039         jg          vs_1_2_loop
   1040     }
   1041 
   1042 }
   1043 
   1044 /****************************************************************************
   1045  *
   1046  *  ROUTINE       : last_vertical_band_1_2_scale_mmx
   1047  *
   1048  *  INPUTS        : unsigned char *dest    : Row to be duplicated.
   1049  *                  unsigned int dest_pitch : Byte offset from one row to the next.
   1050  *                  unsigned int dest_width : Bytes per row to process (8 bytes per loop pass).
   1051  *
   1052  *  OUTPUTS       : The row at dest + dest_pitch is overwritten.
   1053  *
   1054  *  RETURNS       : void
   1055  *
   1056  *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
   1057  *
   1058  *  SPECIAL NOTES : No row below the band is read: the row at dest + dest_pitch
   1059  *                  simply becomes a copy of the row at dest. The function also
   1060  *                  has an "C" only version.
   1061  *
   1062  ****************************************************************************/
   1063 static
   1064 void last_vertical_band_1_2_scale_mmx
   1065 (
   1066     unsigned char *dest,
   1067     unsigned int dest_pitch,
   1068     unsigned int dest_width
   1069 )
   1070 {
   1071     __asm
   1072     {
   1073         mov         esi,    dest                    // esi = row being copied; advances 8 bytes per pass
   1074         mov         ecx,    dest_pitch               // ecx = row pitch in bytes
   1075 
   1076         mov         edx,    dest_width               // Loop counter: bytes of the row still to do
   1077 
   1078         last_vs_1_2_loop:
   1079 
   1080         movq        mm0,    [esi]                   // get Src[0]
   1081         movq        [esi+ecx], mm0                  // write out eight bytes, unchanged, one row down
   1082 
   1083         add         esi,    8
   1084         sub         edx,    8                       // must stay last before jg: it sets the flags tested
   1085 
   1086         jg         last_vs_1_2_loop
   1087     }
   1088 }
   1089 
   1090 /****************************************************************************
   1091  *
   1092  *  ROUTINE       : horizontal_line_1_2_scale_mmx
   1093  *
   1094  *  INPUTS        : const unsigned char *source : Source row.
   1095  *                  unsigned int source_width    : Number of source bytes to scale.
   1096  *                  unsigned char *dest         : Destination row; two bytes are written per source byte.
   1097  *                  unsigned int dest_width      : Not used.
   1098  *
   1099  *  OUTPUTS       : None.
   1100  *
   1101  *  RETURNS       : void
   1102  *
   1103  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
   1104  *
   1105  *  SPECIAL NOTES : Output is p0, avg(p0,p1), p1, avg(p1,p2), ... with avg = (x + y + 1) >> 1.
   1106  *                  NOTE(review): the loop body runs before the first count check and the
   1107  *                  8-pixel tail always runs, so >= 16 source bytes are consumed - confirm callers.
   1108  ****************************************************************************/
   1109 static
   1110 void horizontal_line_1_2_scale_mmx
   1111 (
   1112     const unsigned char *source,
   1113     unsigned int source_width,
   1114     unsigned char *dest,
   1115     unsigned int dest_width
   1116 )
   1117 {
   1118     (void) dest_width;
   1119 
   1120     __asm
   1121     {
   1122         mov         esi,    source                  // esi = read cursor
   1123         mov         edi,    dest                    // edi = write cursor (advances 16 per 8 source bytes)
   1124 
   1125         pxor        mm7,    mm7                     // mm7 = 0 for byte -> word unpacking
   1126         movq        mm6,    four_ones               // mm6 = 1, 1, 1, 1 (rounding term)
   1127 
   1128         mov         ecx,    source_width            // ecx = source bytes still to do
   1129 
   1130         hs_1_2_loop:
   1131 
   1132         movq        mm0,    [esi]                   // p0..p7
   1133         movq        mm1,    [esi+1]                 // p1..p8: right-hand neighbour of each pixel
   1134 
   1135         movq        mm2,    mm0
   1136         movq        mm3,    mm1
   1137 
   1138         movq        mm4,    mm0                     // keep the original bytes for the interleave below
   1139         punpcklbw   mm0,    mm7
   1140 
   1141         punpcklbw   mm1,    mm7
   1142         paddw       mm0,    mm1                     // low: p[i] + p[i+1]
   1143 
   1144         paddw       mm0,    mm6                     // low: + 1
   1145         punpckhbw   mm2,    mm7
   1146 
   1147         punpckhbw   mm3,    mm7
   1148         paddw       mm2,    mm3                     // high: p[i] + p[i+1]
   1149 
   1150         paddw       mm2,    mm6                     // high: + 1
   1151         psraw       mm0,    1
   1152 
   1153         psraw       mm2,    1
   1154         packuswb    mm0,    mm2                     // mm0 = eight rounded averages
   1155 
   1156         movq        mm2,    mm4
   1157         punpcklbw   mm2,    mm0                     // p0 avg0 p1 avg1 p2 avg2 p3 avg3
   1158 
   1159         movq        [edi],  mm2
   1160         punpckhbw   mm4,    mm0                     // p4 avg4 ... p7 avg7
   1161 
   1162         movq        [edi+8], mm4
   1163         add         esi,    8
   1164 
   1165         add         edi,    16
   1166         sub         ecx,    8
   1167 
   1168         cmp         ecx,    8                       // leave the final 8 bytes to the tail below
   1169         jg          hs_1_2_loop
   1170 
   1171 // last eight pixels: there is no p8 to read, so p7 serves as its own right-hand neighbour
   1172 
   1173         movq        mm0,    [esi]
   1174         movq        mm1,    mm0
   1175 
   1176         movq        mm2,    mm0
   1177         movq        mm3,    mm1
   1178 
   1179         psrlq       mm1,    8                       // p1..p7, top byte now zero
   1180         psrlq       mm3,    56
   1181 
   1182         psllq       mm3,    56                      // p7 isolated in the top byte
   1183         por         mm1,    mm3                     // mm1 = p1..p7, p7
   1184 
   1185         movq        mm3,    mm1
   1186         movq        mm4,    mm0
   1187 
   1188         punpcklbw   mm0,    mm7
   1189         punpcklbw   mm1,    mm7
   1190 
   1191         paddw       mm0,    mm1
   1192         paddw       mm0,    mm6
   1193 
   1194         punpckhbw   mm2,    mm7
   1195         punpckhbw   mm3,    mm7
   1196 
   1197         paddw       mm2,    mm3
   1198         paddw       mm2,    mm6
   1199 
   1200         psraw       mm0,    1
   1201         psraw       mm2,    1
   1202 
   1203         packuswb    mm0,    mm2
   1204         movq        mm2,    mm4
   1205 
   1206         punpcklbw   mm2,    mm0
   1207         movq        [edi],  mm2
   1208 
   1209         punpckhbw   mm4,    mm0
   1210         movq        [edi+8], mm4
   1211     }
   1212 }
   1212 
   1213 
   1214 
   1215 
   1216 
   1217 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };  // per-lane weight (x/256) of src[i+1]
   1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };  // per-lane weight (x/256) of src[i]
   1219 
   1220 
   1221 /****************************************************************************
   1222  *
   1223  *  ROUTINE       : horizontal_line_5_4_scale_mmx
   1224  *
   1225  *  INPUTS        : const unsigned char *source : Pointer to source data.
   1226  *                  unsigned int source_width    : Number of source bytes to scale (5 consumed per pass).
   1227  *                  unsigned char *dest         : Pointer to destination data.
   1228  *                  unsigned int dest_width      : Stride of destination (NOT USED).
   1229  *
   1230  *  OUTPUTS       : None.
   1231  *
   1232  *  RETURNS       : void
   1233  *
   1234  *  FUNCTION      : Copies horizontal line of pixels from source to
   1235  *                  destination scaling down by 5 to 4.
   1236  *
   1237  *  SPECIAL NOTES : Every pass loads 8 source bytes although only 5 are used, so the
   1238  *                  final pass reads 3 bytes beyond source + source_width.
   1239  ****************************************************************************/
   1240 static
   1241 void horizontal_line_5_4_scale_mmx
   1242 (
   1243     const unsigned char *source,
   1244     unsigned int source_width,
   1245     unsigned char *dest,
   1246     unsigned int dest_width
   1247 )
   1248 {
   1249     /*
   1250     unsigned i;
   1251     unsigned int a, b, c, d, e;
   1252     unsigned char *des = dest;
   1253     const unsigned char *src = source;
   1254 
   1255     (void) dest_width;
   1256 
   1257     for ( i=0; i<source_width; i+=5 )
   1258     {
   1259         a = src[0];
   1260         b = src[1];
   1261         c = src[2];
   1262         d = src[3];
   1263         e = src[4];
   1264 
   1265         des[0] = a;
   1266         des[1] = ((b*192 + c* 64 + 128)>>8);
   1267         des[2] = ((c*128 + d*128 + 128)>>8);
   1268         des[3] = ((d* 64 + e*192 + 128)>>8);
   1269 
   1270         src += 5;
   1271         des += 4;
   1272     }
   1273     */
   1274     (void) dest_width;
   1275 
   1276     __asm
   1277     {
   1278 
   1279         mov         esi,        source              ;
   1280         mov         edi,        dest                ;
   1281 
   1282         mov         ecx,        source_width         ;
   1283         movq        mm5,        const54_1           ;
   1284 
   1285         pxor        mm7,        mm7                 ;
   1286         movq        mm6,        const54_2           ;
   1287 
   1288         movq        mm4,        round_values         ;
   1289         lea         edx,        [esi+ecx]           ;
   1290         horizontal_line_5_4_loop:
   1291 
   1292         movq        mm0,        QWORD PTR  [esi]    ;
   1293         // 00 01 02 03 04 05 06 07
   1294         movq        mm1,        mm0                 ;
   1295         // 00 01 02 03 04 05 06 07
   1296 
   1297         psrlq       mm0,        8                   ;
   1298         // 01 02 03 04 05 06 07 xx
   1299         punpcklbw   mm1,        mm7                 ;
   1300         // xx 00 xx 01 xx 02 xx 03
   1301 
   1302         punpcklbw   mm0,        mm7                 ;
   1303         // xx 01 xx 02 xx 03 xx 04
   1304         pmullw      mm1,        mm5                 // src[i]   * const54_1[i]
   1305 
   1306         pmullw      mm0,        mm6                 // src[i+1] * const54_2[i]
   1307         add         esi,        5
   1308 
   1309         add         edi,        4
   1310         paddw       mm1,        mm0
   1311 
   1312         paddw       mm1,        mm4                 // + 128
   1313         psrlw       mm1,        8
   1314 
   1315         cmp         esi,        edx                 // flags survive the MMX ops below (they leave EFLAGS alone)
   1316         packuswb    mm1,        mm7
   1317 
   1318         movd        DWORD PTR [edi-4], mm1          // store des[0..3] of this group
   1319 
   1320         jb          horizontal_line_5_4_loop        // pointers are unsigned: loop while esi is below the end
   1321 
   1322     }
   1323 
   1324 }
   1325 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };  //  64/256 in every lane
   1326 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };  // 128/256 in every lane
   1327 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };  // 192/256 in every lane
   1328 // 5 to 4 vertical down-scale: reads five source rows and writes four dest rows, 4 bytes per loop pass.
   1329 static
   1330 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1331 {
   1332     // Dest row 0 copies source row 0; rows 1..3 blend source rows (1,2), (2,3), (3,4) with weights 3:1, 2:2, 1:3.
   1333     __asm
   1334     {
   1335         push        ebx                             // ebx is used as the loop counter below
   1336 
   1337         mov         esi,    source                    // esi = source row 0
   1338         mov         ecx,    src_pitch               // ecx = source pitch in bytes
   1339 
   1340         mov         edi,    dest                    // edi = dest row 0
   1341         pxor        mm7,    mm7                     // clear out mm7
   1342 
   1343         mov         edx,    dest_pitch               // edx = dest pitch in bytes
   1344         mov         ebx,    dest_width              // ebx = loop counter: bytes of the row still to do
   1345 
   1346         vs_5_4_loop:
   1347 
   1348         movd        mm0,    DWORD ptr [esi]         // src[0];
   1349         movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
   1350 
   1351         movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2];
   1352         lea         eax,    [esi+ecx*2]             // eax = source row 2, base for rows 3 and 4
   1353 
   1354         punpcklbw   mm1,    mm7
   1355         punpcklbw   mm2,    mm7
   1356 
   1357         movq        mm3,    mm2                     // second copy of src[2]
   1358         pmullw      mm1,    three_fourths           // src[1] * 3/4
   1359 
   1360         pmullw      mm2,    one_fourths             // src[2] * 1/4
   1361         movd        mm4,    [eax+ecx]               // src[3];
   1362 
   1363         pmullw      mm3,    two_fourths             // src[2] * 2/4
   1364         punpcklbw   mm4,    mm7
   1365 
   1366         movq        mm5,    mm4                     // second copy of src[3]
   1367         pmullw      mm4,    two_fourths             // src[3] * 2/4
   1368 
   1369         paddw       mm1,    mm2                     // src[1] * 3/4 + src[2] * 1/4
   1370         movd        mm6,    [eax+ecx*2]             // src[4];
   1371 
   1372         pmullw      mm5,    one_fourths             // src[3] * 1/4
   1373         paddw       mm1,    round_values;
   1374 
   1375         paddw       mm3,    mm4                     // src[2] * 2/4 + src[3] * 2/4
   1376         psrlw       mm1,    8
   1377 
   1378         punpcklbw   mm6,    mm7
   1379         paddw       mm3,    round_values            // + 128
   1380 
   1381         pmullw      mm6,    three_fourths           // src[4] * 3/4
   1382         psrlw       mm3,    8
   1383 
   1384         packuswb    mm1,    mm7                     // des[1]
   1385         packuswb    mm3,    mm7                     // des[2]
   1386 
   1387         movd        DWORD PTR [edi], mm0            // write des[0] = src[0], unmodified
   1388         movd        DWORD PTR [edi+edx], mm1        // write des[1]
   1389 
   1390 
   1391         paddw       mm5,    mm6                     // src[3] * 1/4 + src[4] * 3/4
   1392         movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
   1393 
   1394         lea         eax,    [edi+edx*2]             // eax = dest row 2, taken before edi advances
   1395         paddw       mm5,    round_values            // + 128
   1396 
   1397         psrlw       mm5,    8
   1398         add         edi,    4
   1399 
   1400         packuswb    mm5,    mm7                     // des[3]
   1401         movd        DWORD PTR [eax+edx], mm5        // write des[3]
   1402 
   1403         add         esi,    4
   1404         sub         ebx,    4                       // flags from this sub drive the jg below
   1405 
   1406         jg         vs_5_4_loop
   1407 
   1408         pop         ebx
   1409     }
   1410 }
   1411 
   1412 
   1413 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };  // weights (x/256) for source bytes 1, 3 (lanes 1, 2)
   1414 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };  // weights (x/256) for source bytes 0, 2, 4 (lanes 0..2)
   1415 
   1416 // 5 to 3 horizontal down-scale: des[0] = s0, des[1] = (85*s1 + 171*s2 + 128) >> 8, des[2] = (171*s3 + 85*s4 + 128) >> 8.
   1417 static
   1418 void horizontal_line_5_3_scale_mmx
   1419 (
   1420     const unsigned char *source,
   1421     unsigned int source_width,
   1422     unsigned char *dest,
   1423     unsigned int dest_width
   1424 )
   1425 {
   1426     // NOTE(review): the loop body runs once before its first test and the tail always runs, so >= 10 source bytes are consumed - confirm callers.
   1427     (void) dest_width;
   1428     __asm
   1429     {
   1430 
   1431         mov         esi,        source              ;
   1432         mov         edi,        dest                ;
   1433 
   1434         mov         ecx,        source_width         ;
   1435         movq        mm5,        const53_1           ;
   1436 
   1437         pxor        mm7,        mm7                 ;
   1438         movq        mm6,        const53_2           ;
   1439 
   1440         movq        mm4,        round_values         ;
   1441         lea         edx,        [esi+ecx-5]         ;
   1442         horizontal_line_5_3_loop:
   1443 
   1444         movq        mm0,        QWORD PTR  [esi]    ;
   1445         // 00 01 02 03 04 05 06 07
   1446         movq        mm1,        mm0                 ;
   1447         // 00 01 02 03 04 05 06 07
   1448 
   1449         psllw       mm0,        8                   ;
   1450         // xx 00 xx 02 xx 04 xx 06
   1451         psrlw       mm1,        8                   ;
   1452         // 01 xx 03 xx 05 xx 07 xx
   1453 
   1454         psrlw       mm0,        8                   ;
   1455         // 00 xx 02 xx 04 xx 06 xx
   1456         psllq       mm1,        16                  ;
   1457         // xx xx 01 xx 03 xx 05 xx
   1458 
   1459         pmullw      mm0,        mm6                 // even bytes * const53_2
   1460 
   1461         pmullw      mm1,        mm5                 // odd bytes  * const53_1
   1462         add         esi,        5
   1463 
   1464         add         edi,        3
   1465         paddw       mm1,        mm0
   1466 
   1467         paddw       mm1,        mm4                 // + 128
   1468         psrlw       mm1,        8
   1469 
   1470         cmp         esi,        edx                 // flags survive the MMX ops below (they leave EFLAGS alone)
   1471         packuswb    mm1,        mm7
   1472 
   1473         movd        DWORD PTR [edi-3], mm1          // 3 results + 1 spill byte that the next group overwrites
   1474         jb          horizontal_line_5_3_loop        // pointers are unsigned: loop while esi is below the last group
   1475 
   1476 //exit condition: last group of 5, stored as word + byte so nothing is written past des[2]
   1477         movq        mm0,        QWORD PTR  [esi]    ;
   1478         // 00 01 02 03 04 05 06 07
   1479         movq        mm1,        mm0                 ;
   1480         // 00 01 02 03 04 05 06 07
   1481 
   1482         psllw       mm0,        8                   ;
   1483         // xx 00 xx 02 xx 04 xx 06
   1484         psrlw       mm1,        8                   ;
   1485         // 01 xx 03 xx 05 xx 07 xx
   1486 
   1487         psrlw       mm0,        8                   ;
   1488         // 00 xx 02 xx 04 xx 06 xx
   1489         psllq       mm1,        16                  ;
   1490         // xx xx 01 xx 03 xx 05 xx
   1491 
   1492         pmullw      mm0,        mm6
   1493 
   1494         pmullw      mm1,        mm5
   1495         paddw       mm1,        mm0
   1496 
   1497         paddw       mm1,        mm4
   1498         psrlw       mm1,        8
   1499 
   1500         packuswb    mm1,        mm7
   1501         movd        eax,        mm1                 // eax = des[0] | des[1] << 8 | des[2] << 16
   1502 
   1503         mov         edx,        eax
   1504         shr         edx,        16                  // dl = des[2]
   1505 
   1506         mov         WORD PTR[edi],   ax
   1507         mov         BYTE PTR[edi+2], dl
   1508 
   1509     }
   1510 
   1511 }
   1512 
   1513 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };  //  85/256 in every lane
   1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };  // 171/256 in every lane
   1515 // 5 to 3 vertical down-scale: reads five source rows and writes three dest rows, 4 bytes per loop pass.
   1516 static
   1517 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1518 {
   1519     // Dest row 0 copies source row 0; row 1 = 1/3 src[1] + 2/3 src[2]; row 2 = 2/3 src[3] + 1/3 src[4].
   1520     __asm
   1521     {
   1522         push        ebx                             // ebx is used as the loop counter below
   1523 
   1524         mov         esi,    source                    // esi = source row 0
   1525         mov         ecx,    src_pitch               // ecx = source pitch in bytes
   1526 
   1527         mov         edi,    dest                    // edi = dest row 0
   1528         pxor        mm7,    mm7                     // clear out mm7
   1529 
   1530         mov         edx,    dest_pitch               // edx = dest pitch in bytes
   1531         movq        mm5,    one_thirds              // mm5 = 1/3
   1532 
   1533         movq        mm6,    two_thirds              // mm6 = 2/3
   1534         mov         ebx,    dest_width;
   1535 
   1536         vs_5_3_loop:
   1537 
   1538         movd        mm0,    DWORD ptr [esi]         // src[0];
   1539         movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
   1540 
   1541         movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2];
   1542         lea         eax,    [esi+ecx*2]             // eax = source row 2, base for rows 3 and 4
   1543 
   1544         punpcklbw   mm1,    mm7
   1545         punpcklbw   mm2,    mm7
   1546 
   1547         pmullw      mm1,    mm5                     // src[1] * 1/3
   1548         pmullw      mm2,    mm6                     // src[2] * 2/3
   1549 
   1550         movd        mm3,    DWORD ptr [eax+ecx]     // src[3];
   1551         movd        mm4,    DWORD ptr [eax+ecx*2]   // src[4];
   1552 
   1553         punpcklbw   mm3,    mm7
   1554         punpcklbw   mm4,    mm7
   1555 
   1556         pmullw      mm3,    mm6                     // src[3] * 2/3
   1557         pmullw      mm4,    mm5                     // src[4] * 1/3
   1558 
   1559 
   1560         movd        DWORD PTR [edi], mm0            // write des[0] = src[0], unmodified
   1561         paddw       mm1,    mm2                     // src[1] * 1/3 + src[2] * 2/3
   1562 
   1563         paddw       mm1,    round_values            // + 128
   1564         psrlw       mm1,    8
   1565 
   1566         packuswb    mm1,    mm7                     // des[1]
   1567         paddw       mm3,    mm4                     // src[3] * 2/3 + src[4] * 1/3
   1568 
   1569         paddw       mm3,    round_values            // + 128
   1570         movd        DWORD PTR [edi+edx], mm1        // write des[1]
   1571 
   1572         psrlw       mm3,    8
   1573         packuswb    mm3,    mm7                     // des[2]
   1574 
   1575         movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
   1576 
   1577 
   1578         add         edi,    4
   1579         add         esi,    4
   1580 
   1581         sub         ebx,    4                       // flags from this sub drive the jg below
   1582         jg          vs_5_3_loop
   1583 
   1584         pop         ebx
   1585     }
   1586 }
   1587 
   1588 
   1589 
   1590 
   1591 /****************************************************************************
   1592  *
   1593  *  ROUTINE       : horizontal_line_2_1_scale_mmx
   1594  *
   1595  *  INPUTS        : const unsigned char *source : Source line; byte 2*i is read for output i.
   1596  *                  unsigned int source_width    : Not used.
   1597  *                  unsigned char *dest         : Destination line.
   1598  *                  unsigned int dest_width      : Number of output pixels to produce.
   1599  *
   1600  *  OUTPUTS       : None.
   1601  *
   1602  *  RETURNS       : void
   1603  *
   1604  *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels:
         *                  every even-indexed source byte is kept, every
         *                  odd-indexed one is dropped (no filtering).
   1605  *
   1606  *  SPECIAL NOTES : Each iteration loads 8 source bytes and stores 4
         *                  destination bytes, and the loop is bottom-tested, so
         *                  at least 4 bytes are written and a dest_width that is
         *                  not a multiple of 4 is effectively rounded up.
         *                  NOTE(review): no emms is executed here; presumably
         *                  the caller clears the MMX state - confirm.
   1607  *
   1608  ****************************************************************************/
   1609 static
   1610 void horizontal_line_2_1_scale_mmx
   1611 (
   1612     const unsigned char *source,
   1613     unsigned int source_width,
   1614     unsigned char *dest,
   1615     unsigned int dest_width
   1616 )
   1617 {
   1618     (void) dest_width;      // note: dest_width IS read by the asm block below (loaded into ecx)
   1619     (void) source_width;    // genuinely unused
   1620     __asm
   1621     {
   1622         mov         esi,    source                  // esi = source line
   1623         mov         edi,    dest                    // edi = destination line
   1624 
   1625         pxor        mm7,    mm7                     // mm7 = 0, zero operand for the pack
   1626         mov         ecx,    dest_width              // ecx = loop limit in output pixels
   1627 
   1628         xor         edx,    edx                     // edx = output index, starts at 0
   1629         hs_2_1_loop:
   1630 
   1631         movq        mm0,    [esi+edx*2]             // 8 source bytes for 4 output pixels
   1632         psllw       mm0,    8                       // shift the odd-indexed byte out of each word...
   1633 
   1634         psrlw       mm0,    8                       // ...leaving the even-indexed byte zero-extended
   1635         packuswb    mm0,    mm7                     // 4 words -> 4 bytes in the low dword
   1636 
   1637         movd        DWORD Ptr [edi+edx], mm0;
   1638         add         edx,    4                       // 4 output pixels done
   1639 
   1640         cmp         edx,    ecx
   1641         jl          hs_2_1_loop                     // repeat while edx < dest_width (signed compare)
   1642 
   1643     }
   1644 }
   1645 
   1646 
   1647 
   1648 static
   1649 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1650 {
            /*
             * Copies dest_width bytes from source to dest with vpx_memcpy; no
             * filtering is applied and only the single row at `source` is read.
             * src_pitch and dest_pitch are not used.
             */
   1651     (void) dest_pitch;
   1652     (void) src_pitch;
   1653     vpx_memcpy(dest, source, dest_width);
   1654 }
   1655 
   1656 
   1657 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };  // 48/256 == 3/16; pmullw weight, result is shifted right by 8
   1658 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };  // 160/256 == 10/16; 48 + 160 + 48 == 256
   1659 
   1660 static
   1661 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1662 {
            /*
             * Writes one destination row as a 3-tap vertical blend of the source
             * row above `source`, the row at `source`, and the row below it:
             *
             *   dest = (48 * above + 160 * current + 48 * below + 128) >> 8
             *
             * source/src_pitch : current source row and the byte stride between
             *                    rows; source - src_pitch and source + src_pitch
             *                    are read, so both must be valid.
             * dest             : destination row; dest_pitch is not used.
             * dest_width       : number of columns to process.
             *
             * Four columns are handled per iteration and the loop is bottom-tested,
             * so at least 4 bytes are written and the width is effectively rounded
             * up to a multiple of 4.
             * The worst-case sum 255*(48+160+48) + 128 = 65408 fits in an unsigned
             * 16-bit lane, so the word arithmetic cannot wrap.
             * NOTE(review): no emms is executed here; presumably the caller clears
             * the MMX state - confirm.
             */
   1663 
   1664     (void) dest_pitch;
   1665     __asm
   1666     {
   1667         mov         esi,        source
   1668         mov         edi,        dest
   1669 
   1670         mov         eax,        src_pitch       // eax = source row stride in bytes
   1671         mov         edx,        dest_width
   1672 
   1673         pxor        mm7,        mm7             // mm7 = 0, zero operand for unpack/pack
   1674         sub         esi,        eax             // back one line: esi now walks the row above
   1675 
   1676 
   1677         lea         ecx,        [esi+edx];      // ecx = end address for esi (row above + dest_width)
   1678         movq        mm6,        round_values;   // mm6 = 4 x 128, rounding term kept in a register
   1679 
   1680         movq        mm5,        three_sixteenths;   // mm5 = 4 x 48
   1681         movq        mm4,        ten_sixteenths;     // mm4 = 4 x 160
   1682 
   1683         vs_2_1_i_loop:
   1684         movd        mm0,        [esi]           // row above, 4 pixels
   1685         movd        mm1,        [esi+eax]       // current row
   1686 
   1687         movd        mm2,        [esi+eax*2]     // row below
   1688         punpcklbw   mm0,        mm7
   1689 
   1690         pmullw      mm0,        mm5             // above * 48
   1691         punpcklbw   mm1,        mm7
   1692 
   1693         pmullw      mm1,        mm4             // current * 160
   1694         punpcklbw   mm2,        mm7
   1695 
   1696         pmullw      mm2,        mm5             // below * 48
   1697         paddw       mm0,        mm6             // + 128 for rounding (from mm6, no per-iteration memory read)
   1698 
   1699         paddw       mm1,        mm2
   1700         paddw       mm0,        mm1             // weighted sum of the three rows
   1701 
   1702         psrlw       mm0,        8               // / 256
   1703         packuswb    mm0,        mm7             // back to 4 bytes in the low dword
   1704 
   1705         movd        DWORD PTR [edi],        mm0
   1706         add         esi,        4
   1707 
   1708         add         edi,        4;
   1709         cmp         esi,        ecx
   1710         jb          vs_2_1_i_loop               // esi and ecx are addresses: compare unsigned, so a row crossing 0x80000000 is still fully processed
   1711 
   1712     }
   1713 }
   1714 
   1715 
   1716 
   1717 void
   1718 register_mmxscalers(void)
   1719 {
            /*
             * Points the vp8_* scaler hooks at the implementations to use.
             * The 1:2, 3:5 and 4:5 hooks and the 5:4, 5:3 and 2:1 hooks are bound
             * to the *_mmx routines; the 3:4 and 2:3 hooks are bound to the
             * vp8cx_*_c routines instead.
             * NOTE(review): the vp8_* hooks and most of the assigned routines are
             * declared outside this part of the file; presumably the hooks are
             * global function pointers - confirm.
             */
   1720     vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
   1721     vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
   1722     vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
   1723     vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
   1724     vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
   1725     vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
   1726     vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
   1727     vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
   1728     vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
   1729 
   1730     vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;      // no *_mmx routine is bound for 3:4
   1731     vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
   1732     vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
   1733     vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;      // no *_mmx routine is bound for 2:3
   1734     vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
   1735     vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
   1736 
   1737 
   1738 
   1739     vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
   1740     vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
   1741     vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
   1742     vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
   1743     vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
   1744     vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
   1745     vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
   1746 
   1747 
   1748 
   1749 
   1750 }
   1751