vpx_scale/intel_linux/scaleopt.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
*
*   Module Title :     scaleopt.cpp
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"

/****************************************************************************
*  Module Statics
****************************************************************************/
#if 0
__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
#endif

#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_3_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short const35_2[] = { 154,  51, 205, 102 };
    __declspec(align(16)) unsigned short const35_1[] = { 102, 205,  51, 154 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    (void) dest_width;

    __asm
    {

        push ebx

        mov         esi,    source
        mov         edi,    dest

        mov         ecx,    source_width
        lea         edx,    [esi+ecx-3];

        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx

        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // clear mm7

        horiz_line_3_5_loop:

        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
        mov        ebx,    eax

        and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03

        and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx

        shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03

        shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx

        or          ebx,    ecx             // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx

        movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             //

        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             //

        mov         [edi],  ebx             // writeoutput 00 xx xx xx
        add         esi,    3

        add         edi,    5
        paddw       mm0,    mm1

        paddw       mm0,    mm4
        psrlw       mm0,    8

        cmp         esi,    edx
        packuswb    mm0,    mm7

        movd        DWORD Ptr [edi-4], mm0
        jl          horiz_line_3_5_loop

//Exit:
        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
        mov         ebx,    eax

        and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03

        and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx

        shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03

        shl         eax,    8               // eax = xx 01 02 02
        and         eax,    0xffff0000      // eax = xx xx 02 02

        or          eax,    ebx             // eax = 01 02 02 02

        shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx

        or          ebx,    ecx             // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx

        movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             //

        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             //

        mov         [edi],  ebx             // writeoutput 00 xx xx xx
        paddw       mm0,    mm1

        paddw       mm0,    mm4
        psrlw       mm0,    8

        packuswb    mm0,    mm7
        movd        DWORD Ptr [edi+1], mm0

        pop ebx

    }

    /*
    const unsigned char *src = source;
    unsigned char *des = dest;
    unsigned int a, b, c ;
    unsigned int i;
    (void) dest_width;

    for ( i=0; i<source_width-3; i+=3 )
    {
        a = src[0];
        b = src[1];
        des [0] = (UINT8) (a);
        // 2 * left + 3 * right /5
        des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
        c = src[2] ;
        // 4 * left + 1 * right /5
        des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
        // 1 * left + 4 * right /5
        des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);

        a = src[3];
        // 3 * left + 2 * right /5
        des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);

        src += 3;
        des += 5;
    }

    a = src[0];
    b = src[1];
    des [0] = (UINT8) (a);
    // 2 * left + 3 * right /5
    des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
    c = src[2] ;
    // 4 * left + 1 * right /5
    des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
    // 1 * left + 4 * right /5
    des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);

    des [4] = (UINT8) (c);
    */
}


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_4_5_scale_mmx
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_4_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102,  51 };
    __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 };
    __declspec(align(16)) unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};

    (void)dest_width;

    __asm
    {

        mov         esi,    source
        mov         edi,    dest

        mov         ecx,    source_width
        lea         edx,    [esi+ecx-8];

        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx

        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // clear mm7

        horiz_line_4_5_loop:

        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08

        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08

        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

        movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51

        paddw       mm0,    mm1             // added round values
        paddw       mm0,    mm4

        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7

        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
        add         edi,    10

        add         esi,    8
        paddw       mm2,    mm3             //

        paddw       mm2,    mm4             // added round values
        cmp         esi,    edx

        psrlw       mm2,    8
        packuswb    mm2,    mm7

        movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
        jl         horiz_line_4_5_loop

//Exit:
        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07

        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00

        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00

        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07

        movq        mm3,    mm1

        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

        movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51

        paddw       mm0,    mm1             // added round values
        paddw       mm0,    mm4

        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx

        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
        paddw       mm2,    mm3             //

        paddw       mm2,    mm4             // added round values
        psrlw       mm2,    8

        packuswb    mm2,    mm7
        movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09


    }
    /*
        const unsigned char *src = source;
        unsigned char *des = dest;
        unsigned int a, b, c ;
        unsigned i;
        (void) dest_width;

        for ( i=0; i<source_width-4; i+=4 )
        {
            a = src[0];
            b = src[1];
            des [0] = (UINT8) a;
            des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
            c = src[2] * 154;
            a = src[3];
            des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
            des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
            b = src[4];
            des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);

            src += 4;
            des += 5;
        }

        a = src[0];
        b = src[1];
        des [0] = (UINT8) (a);
        des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
        c = src[2] * 154;
        a = src[3];
        des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
        des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
        des [4] = (UINT8) (a);
    */
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{

    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    __asm
    {

        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         edi,    [esi+ecx*2]             // tow lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        vs_4_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constan

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128

        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
        movq        mm1,    [edi]                   // mm1=Src[3];

        // mm0, mm2 --- Src[2]
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3]

        //  mm1, mm3 --- Src[3]
        //  mm7 -- cleared for unpacking

        movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group

        movq        mm5,    four_fifths              // mm5 = 4/5
        pmullw      mm1,    mm5                     // d * 4/5

        movq        mm6,    one_fifth                // mm6 = 1/5
        movq        mm2,    mm0                     // make a copy

        pmullw      mm3,    mm5                     // d * 4/5
        punpcklbw   mm0,    mm7                     // unpack low

        pmullw      mm0,    mm6                     // an * 1/5
        punpckhbw   mm2,    mm7                     // unpack high

        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
        pmullw      mm2,    mm6                     // an * 1/5

        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
        paddw       mm1,    round_values             // + 128

        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[4]

        movq        QWORD ptr [edi+ecx], mm1        // write des[4]

        add         edi,    8
        add         esi,    8

        sub         edx,    8
        jg         vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : None
 *
 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         edi,    [esi+ecx*2]             // tow lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        last_vs_4_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constan

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128

        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
        movq        mm1,    [edi]                   // mm1=Src[3];

        movq        QWORD ptr [edi+ecx], mm1        // write des[4];

        // mm0, mm2 --- Src[2]
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [edi], mm0            // write des[3]

        //  mm1, mm3 --- Src[3]
        //  mm7 -- cleared for unpacking
        add         edi,    8
        add         esi,    8

        sub         edx,    8
        jg          last_vs_4_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };

    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         edi,    [esi+ecx*2]             // tow lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        vs_3_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths               // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths             // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths              // b * 4/5 low

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths              // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth                // b * 1/5

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth                // b * 1/5

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        pmullw      mm6,    one_fifth                // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    one_fifth                // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths              // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths              // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]

        paddw       mm4,    round_values             // + 128
        paddw       mm5,    round_values             // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [edi], mm4            // write des[3]

        //  mm0, mm2 --- Src[3]

        pxor        mm7,    mm7                     // clear mm7 for unpacking
        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group

        movq        mm5,    three_fifths             // mm5 = 3/5
        pmullw      mm0,    mm5                     // d * 3/5

        movq        mm6,    two_fifths                // mm6 = 2/5
        movq        mm3,    mm1                     // make a copy

        pmullw      mm2,    mm5                     // d * 3/5
        punpcklbw   mm1,    mm7                     // unpack low

        pmullw      mm1,    mm6                     // an * 2/5
        punpckhbw   mm3,    mm7                     // unpack high

        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
        pmullw      mm3,    mm6                     // an * 2/5

        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[4]

        movq        QWORD ptr [edi+ecx], mm0        // write des[4]

        add         edi,    8
        add         esi,    8

        sub         edx,    8
        jg          vs_3_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         edi,    [esi+ecx*2]             // tow lines below
        add         edi,    ecx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter


        last_vs_3_5_loop:

        movq        mm0,    QWORD ptr [esi]         // src[0];
        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths               // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths             // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]


        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths              // b * 4/5 low

        movq        QWORD ptr [edi+ecx], mm0        // write des[4]

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths              // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth                // b * 1/5

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth                // b * 1/5

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        pmullw      mm6,    one_fifth                // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    one_fifth                // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths              // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths              // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]

        paddw       mm4,    round_values             // + 128
        paddw       mm5,    round_values             // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [edi], mm4            // write des[3]

        //  mm0, mm2 --- Src[3]

        add         edi,    8
        add         esi,    8

        sub         edx,    8
        jg          last_vs_3_5_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};

    __asm
    {

        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        vs_1_2_loop:

        movq        mm0,    [esi]                   // get Src[0]
        movq        mm1,    [esi + ecx * 2]         // get Src[1]

        movq        mm2,    mm0                     // make copy before unpack
        movq        mm3,    mm1                     // make copy before unpack

        punpcklbw   mm0,    mm7                     // low Src[0]
        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1

        punpcklbw   mm1,    mm7                     // low Src[1]
        paddw       mm0,    mm1                     // low (a + b)

        punpckhbw   mm2,    mm7                     // high Src[0]
        paddw       mm0,    mm6                     // low (a + b + 1)

        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3                     // high (a + b )

        psraw       mm0,    1                       // low (a + b +1 )/2
        paddw       mm2,    mm6                     // high (a + b + 1)

        psraw       mm2,    1                       // high (a + b + 1)/2
        packuswb    mm0,    mm2                     // pack results

        movq        [esi+ecx], mm0                  // write out eight bytes
        add         esi,    8

        sub         edx,    8
        jg          vs_1_2_loop
    }

}

/****************************************************************************
 *
 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has an "C" only
 *                  version.
 *
 ****************************************************************************/
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         esi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        mov         edx,    dest_width               // Loop counter

        last_vs_1_2_loop:

        movq        mm0,    [esi]                   // get Src[0]
        movq        [esi+ecx], mm0                  // write out eight bytes

        add         esi,    8
        sub         edx,    8

        jg         last_vs_1_2_loop
    }
}

/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_1_2_scale
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_1_2_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};

    (void) dest_width;

    __asm
    {
        mov         esi,    source
        mov         edi,    dest

        pxor        mm7,    mm7
        movq        mm6,    four_ones

        mov         ecx,    source_width

        hs_1_2_loop:

        movq        mm0,    [esi]
        movq        mm1,    [esi+1]

        movq        mm2,    mm0
        movq        mm3,    mm1

        movq        mm4,    mm0
        punpcklbw   mm0,    mm7

        punpcklbw   mm1,    mm7
        paddw       mm0,    mm1

        paddw       mm0,    mm6
        punpckhbw   mm2,    mm7

        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3

        paddw       mm2,    mm6
        psraw       mm0,    1

        psraw       mm2,    1
        packuswb    mm0,    mm2

        movq        mm2,    mm4
        punpcklbw   mm2,    mm0

        movq        [edi],  mm2
        punpckhbw   mm4,    mm0

        movq        [edi+8], mm4
        add         esi,    8

        add         edi,    16
        sub         ecx,    8

        cmp         ecx,    8
        jg          hs_1_2_loop

// last eight pixel

        movq        mm0,    [esi]
        movq        mm1,    mm0

        movq        mm2,    mm0
        movq        mm3,    mm1

        psrlq       mm1,    8
        psrlq       mm3,    56

        psllq       mm3,    56
        por         mm1,    mm3

        movq        mm3,    mm1
        movq        mm4,    mm0

        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7

        paddw       mm0,    mm1
        paddw       mm0,    mm6

        punpckhbw   mm2,    mm7
        punpckhbw   mm3,    mm7

        paddw       mm2,    mm3
        paddw       mm2,    mm6

        psraw       mm0,    1
        psraw       mm2,    1

        packuswb    mm0,    mm2
        movq        mm2,    mm4

        punpcklbw   mm2,    mm0
        movq        [edi],  mm2

        punpckhbw   mm4,    mm0
        movq        [edi+8], mm4
    }
}


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_5_4_scale_mmx
 *
 *  INPUTS        : const unsigned char *source : Pointer to source data.
 *                  unsigned int source_width    : Stride of source.
 *                  unsigned char *dest         : Pointer to destination data.
 *                  unsigned int dest_width      : Stride of destination (NOT USED).
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Copies horizontal line of pixels from source to
 *                  destination scaling up by 4 to 5.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_5_4_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{

    __declspec(align(16)) const unsigned short const54_2[] = {  0,  64, 128, 192 };
    __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128,  64 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    /*
    unsigned i;
    unsigned int a, b, c, d, e;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    for ( i=0; i<source_width; i+=5 )
    {
        a = src[0];
        b = src[1];
        c = src[2];
        d = src[3];
        e = src[4];

        des[0] = a;
        des[1] = ((b*192 + c* 64 + 128)>>8);
        des[2] = ((c*128 + d*128 + 128)>>8);
        des[3] = ((d* 64 + e*192 + 128)>>8);

        src += 5;
        des += 4;
    }
    */
    __asm
    {

        mov         esi,        source              ;
        mov         edi,        dest                ;

        mov         ecx,        source_width         ;
        movq        mm5,        const54_1           ;

        pxor        mm7,        mm7                 ;
        movq        mm6,        const54_2           ;

        movq        mm4,        round_values         ;
        lea         edx,        [esi+ecx]           ;
        horizontal_line_5_4_loop:

        movq        mm0,        QWORD PTR  [esi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07

        psrlq       mm0,        8                   ;
        01 02 03 04 05 06 07 xx
        punpcklbw   mm1,        mm7                 ;
        xx 00 xx 01 xx 02 xx 03

        punpcklbw   mm0,        mm7                 ;
        xx 01 xx 02 xx 03 xx 04
        pmullw      mm1,        mm5

        pmullw      mm0,        mm6
        add         esi,        5

        add         edi,        4
        paddw       mm1,        mm0

        paddw       mm1,        mm4
        psrlw       mm1,        8

        cmp         esi,        edx
        packuswb    mm1,        mm7

        movd        DWORD PTR [edi-4], mm1

        jl          horizontal_line_5_4_loop

    }

}

static
void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{

    __declspec(align(16)) const unsigned short one_fourths[]   = {  64,  64,  64, 64  };
    __declspec(align(16)) const unsigned short two_fourths[]   = { 128, 128, 128, 128 };
    __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 };

    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        push        ebx

        mov         esi,    source                    // Get the source and destination pointer
        mov         ecx,    src_pitch               // Get the pitch size

        mov         edi,    dest                    // tow lines below
        pxor        mm7,    mm7                     // clear out mm7

        mov         edx,    dest_pitch               // Loop counter
        mov         ebx,    dest_width

        vs_5_4_loop:

        movd        mm0,    DWORD ptr [esi]         // src[0];
        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];

        movd        mm2,    DWORD ptr [esi+ecx*2]
        lea         eax,    [esi+ecx*2]             //

        punpcklbw   mm1,    mm7
        punpcklbw   mm2,    mm7

        movq        mm3,    mm2
        pmullw      mm1,    three_fourths

        pmullw      mm2,    one_fourths
        movd        mm4,    [eax+ecx]

        pmullw      mm3,    two_fourths
        punpcklbw   mm4,    mm7

        movq        mm5,    mm4
        pmullw      mm4,    two_fourths

        paddw       mm1,    mm2
        movd        mm6,    [eax+ecx*2]

        pmullw      mm5,    one_fourths
        paddw       mm1,    round_values;

        paddw       mm3,    mm4
        psrlw       mm1,    8

        punpcklbw   mm6,    mm7
        paddw       mm3,    round_values

        pmullw      mm6,    three_fourths
        psrlw       mm3,    8

        packuswb    mm1,    mm7
        packuswb    mm3,    mm7

        movd        DWORD PTR [edi], mm0
        movd        DWORD PTR [edi+edx], mm1


        paddw       mm5,    mm6
        movd        DWORD PTR [edi+edx*2], mm3

        lea         eax,    [edi+edx*2]
        paddw       mm5,    round_values

        psrlw       mm5,    8
        add         edi,    4

        packuswb    mm5,    mm7
        movd        DWORD PTR [eax+edx], mm5

        add         esi,    4
        sub         ebx,    4

        jg         vs_5_4_loop

        pop         ebx
    }
}


static
void horizontal_line_5_3_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __declspec(align(16)) const unsigned short const53_1[] = {  0,  85, 171, 0 };
    __declspec(align(16)) const unsigned short const53_2[] = {256, 171,  85, 0 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {

        mov         esi,        source              ;
        mov         edi,        dest                ;

        mov         ecx,        source_width         ;
        movq        mm5,        const53_1           ;

        pxor        mm7,        mm7                 ;
        movq        mm6,        const53_2           ;

        movq        mm4,        round_values         ;
        lea         edx,        [esi+ecx-5]         ;
        horizontal_line_5_3_loop:

        movq        mm0,        QWORD PTR  [esi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07

        psllw       mm0,        8                   ;
        xx 00 xx 02 xx 04 xx 06
        psrlw       mm1,        8                   ;
        01 xx 03 xx 05 xx 07 xx

        psrlw       mm0,        8                   ;
        00 xx 02 xx 04 xx 06 xx
        psllq       mm1,        16                  ;
        xx xx 01 xx 03 xx 05 xx

        pmullw      mm0,        mm6

        pmullw      mm1,        mm5
        add         esi,        5

        add         edi,        3
        paddw       mm1,        mm0

        paddw       mm1,        mm4
        psrlw       mm1,        8

        cmp         esi,        edx
        packuswb    mm1,        mm7

        movd        DWORD PTR [edi-3], mm1
        jl          horizontal_line_5_3_loop

//exit condition
        movq        mm0,        QWORD PTR  [esi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07

        psllw       mm0,        8                   ;
        xx 00 xx 02 xx 04 xx 06
        psrlw       mm1,        8                   ;
        01 xx 03 xx 05 xx 07 xx

        psrlw       mm0,        8                   ;
        00 xx 02 xx 04 xx 06 xx
        psllq       mm1,        16                  ;
        xx xx 01 xx 03 xx 05 xx

        pmullw      mm0,        mm6

        pmullw      mm1,        mm5
        paddw       mm1,        mm0

        paddw       mm1,        mm4
        psrlw       mm1,        8

        packuswb    mm1,        mm7
        movd        eax,        mm1

        mov         edx,        eax
        shr         edx,        16

        mov         WORD PTR[edi],   ax
        mov         BYTE PTR[edi+2], dl

    }

}


static
void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __declspec(align(16)) const unsigned short one_thirds[] = {  85,  85,  85,  85 };
    __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 };

    __asm
    {
        push        ebx

        mov         esi,    source                    // Get the source and destination pointer
        mov         ecx,    src_pitch               // Get the pitch size

        mov         edi,    dest                    // tow lines below
        pxor        mm7,    mm7                     // clear out mm7

        mov         edx,    dest_pitch               // Loop counter
        movq        mm5,    one_thirds

        movq        mm6,    two_thirds
        mov         ebx,    dest_width;

        vs_5_3_loop:

        movd        mm0,    DWORD ptr [esi]         // src[0];
        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];

        movd        mm2,    DWORD ptr [esi+ecx*2]
        lea         eax,    [esi+ecx*2]             //

        punpcklbw   mm1,    mm7
        punpcklbw   mm2,    mm7

        pmullw      mm1,    mm5
        pmullw      mm2,    mm6

        movd        mm3,    DWORD ptr [eax+ecx]
        movd        mm4,    DWORD ptr [eax+ecx*2]

        punpcklbw   mm3,    mm7
        punpcklbw   mm4,    mm7

        pmullw      mm3,    mm6
        pmullw      mm4,    mm5


        movd        DWORD PTR [edi], mm0
        paddw       mm1,    mm2

        paddw       mm1,    round_values
        psrlw       mm1,    8

        packuswb    mm1,    mm7
        paddw       mm3,    mm4

        paddw       mm3,    round_values
        movd        DWORD PTR [edi+edx], mm1

        psrlw       mm3,    8
        packuswb    mm3,    mm7

        movd        DWORD PTR [edi+edx*2], mm3


        add         edi,    4
        add         esi,    4

        sub         ebx,    4
        jg          vs_5_3_loop

        pop         ebx
    }
}


/****************************************************************************
 *
 *  ROUTINE       : horizontal_line_2_1_scale
 *
 *  INPUTS        : const unsigned char *source :
 *                  unsigned int source_width    :
 *                  unsigned char *dest         :
 *                  unsigned int dest_width      :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
 *
 *  SPECIAL NOTES : None.
 *
 ****************************************************************************/
static
void horizontal_line_2_1_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    __asm
    {
        mov         esi,    source
        mov         edi,    dest

        pxor        mm7,    mm7
        mov         ecx,    dest_width

        xor         edx,    edx
        hs_2_1_loop:

        movq        mm0,    [esi+edx*2]
        psllw       mm0,    8

        psrlw       mm0,    8
        packuswb    mm0,    mm7

        movd        DWORD Ptr [edi+edx], mm0;
        add         edx,    4

        cmp         edx,    ecx
        jl          hs_2_1_loop

    }
}


static
void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{
    vpx_memcpy(dest, source, dest_width);
}


static
void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
{

    __declspec(align(16)) const unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
    __declspec(align(16)) const unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
    __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    __asm
    {
        mov         esi,        source
        mov         edi,        dest

        mov         eax,        src_pitch
        mov         edx,        dest_width

        pxor        mm7,        mm7
        sub         esi,        eax             //back one line


        lea         ecx,        [esi+edx];
        movq        mm6,        round_values;

        movq        mm5,        three_sixteenths;
        movq        mm4,        ten_sixteenths;

        vs_2_1_i_loop:
        movd        mm0,        [esi]           //
        movd        mm1,        [esi+eax]       //

        movd        mm2,        [esi+eax*2]     //
        punpcklbw   mm0,        mm7

        pmullw      mm0,        mm5
        punpcklbw   mm1,        mm7

        pmullw      mm1,        mm4
        punpcklbw   mm2,        mm7

        pmullw      mm2,        mm5
        paddw       mm0,        round_values

        paddw       mm1,        mm2
        paddw       mm0,        mm1

        psrlw       mm0,        8
        packuswb    mm0,        mm7

        movd        DWORD PTR [edi],        mm0
        add         esi,        4

        add         edi,        4;
        cmp         esi,        ecx
        jl          vs_2_1_i_loop

    }
}

void
register_mmxscalers(void)
{
    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;

    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;


    vp8_vertical_band_5_4_scale          = vertical_band_5_4_scale_mmx;
    vp8_vertical_band_5_3_scale          = vertical_band_5_3_scale_mmx;
    vp8_vertical_band_2_1_scale          = vertical_band_2_1_scale_mmx;
    vp8_vertical_band_2_1_scale_i        = vertical_band_2_1_scale_i_mmx;
    vp8_horizontal_line_2_1_scale        = horizontal_line_2_1_scale_mmx;
    vp8_horizontal_line_5_3_scale        = horizontal_line_5_3_scale_mmx;
    vp8_horizontal_line_5_4_scale        = horizontal_line_5_4_scale_mmx;

}