vpx_scale/x86_64/scaleopt.c

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
*
*   Module Title :     scaleopt.cpp
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
#include "pragmas.h"


/****************************************************************************
*  Module Statics
****************************************************************************/
__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };


#include "vpx_scale/vpxscale.h"
#include "vpx_mem/vpx_mem.h"

/****************************************************************************
*
*  ROUTINE       : horizontal_line_3_5_scale_mmx
*
*  INPUTS        : const unsigned char *source :
*                  unsigned int source_width    :
*                  unsigned char *dest         :
*                  unsigned int dest_width      :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
*
*  SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_3_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    __asm
    {

        push        rbx

        mov         rsi,    source
        mov         rdi,    dest

        mov         ecx,    source_width
        lea         rdx,    [rsi+rcx-3];

        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx

        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // clear mm7

        horiz_line_3_5_loop:

        mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
        mov         ebx,    eax

        and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03

        and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx

        shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03

        shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx

        or          ebx,    ecx             // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx

        movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             //

        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             //

        mov         [rdi],  ebx             // writeoutput 00 xx xx xx
        add         rsi,    3

        add         rdi,    5
        paddw       mm0,    mm1

        paddw       mm0,    mm4
        psrlw       mm0,    8

        cmp         rsi,    rdx
        packuswb    mm0,    mm7

        movd        DWORD Ptr [rdi-4], mm0
        jl          horiz_line_3_5_loop

//Exit:
        mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
        mov         ebx,    eax

        and         ebx,    0xffff00        // ebx = xx 01 02 xx
        mov         ecx,    eax             // ecx = 00 01 02 03

        and         eax,    0xffff0000      // eax = xx xx 02 03
        xor         ecx,    eax             // ecx = 00 01 xx xx

        shr         ebx,    8               // ebx = 01 02 xx xx
        or          eax,    ebx             // eax = 01 02 02 03

        shl         eax,    8               // eax = xx 01 02 02
        and         eax,    0xffff0000      // eax = xx xx 02 02

        or          eax,    ebx             // eax = 01 02 02 02

        shl         ebx,    16              // ebx = xx xx 01 02
        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx

        or          ebx,    ecx             // ebx = 00 01 01 02
        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx

        movd        mm0,    ebx             // mm0 = 00 01 01 02
        pmullw      mm1,    mm6             //

        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
        pmullw      mm0,    mm5             //

        mov         [rdi],  ebx             // writeoutput 00 xx xx xx
        paddw       mm0,    mm1

        paddw       mm0,    mm4
        psrlw       mm0,    8

        packuswb    mm0,    mm7
        movd        DWORD Ptr [rdi+1], mm0

        pop rbx

    }

}


/****************************************************************************
*
*  ROUTINE       : horizontal_line_4_5_scale_mmx
*
*  INPUTS        : const unsigned char *source :
*                  unsigned int source_width    :
*                  unsigned char *dest         :
*                  unsigned int dest_width      :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
*
*  SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_4_5_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void)dest_width;

    __asm
    {

        mov         rsi,    source
        mov         rdi,    dest

        mov         ecx,    source_width
        lea         rdx,    [rsi+rcx-8];

        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx

        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
        pxor        mm7,    mm7             // clear mm7

        horiz_line_4_5_loop:

        movq        mm0,    QWORD PTR [rsi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    QWORD PTR [rsi+1];        // mm1 = 01 02 03 04 05 06 07 08

        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08

        movd        DWORD PTR [rdi],  mm0             // write output 00 xx xx xx
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

        movd        DWORD PTR [rdi+5], mm2            // write ouput 05 xx xx xx
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51

        paddw       mm0,    mm1             // added round values
        paddw       mm0,    mm4

        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7

        movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
        add         rdi,    10

        add         rsi,    8
        paddw       mm2,    mm3             //

        paddw       mm2,    mm4             // added round values
        cmp         rsi,    rdx

        psrlw       mm2,    8
        packuswb    mm2,    mm7

        movd        DWORD PTR [rdi-4], mm2 // writeoutput 06 07 08 09
        jl         horiz_line_4_5_loop

//Exit:
        movq        mm0,    [rsi]           // mm0 = 00 01 02 03 04 05 06 07
        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07

        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00

        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00

        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07

        movq        mm3,    mm1

        movd        DWORD PTR [rdi],  mm0   // write output 00 xx xx xx
        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx

        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205

        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx

        movd        DWORD PTR [rdi+5], mm2  // write ouput 05 xx xx xx
        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205

        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51

        paddw       mm0,    mm1             // added round values
        paddw       mm0,    mm4

        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx

        movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
        paddw       mm2,    mm3             //

        paddw       mm2,    mm4             // added round values
        psrlw       mm2,    8

        packuswb    mm2,    mm7
        movd        DWORD PTR [rdi+6], mm2  // writeoutput 06 07 08 09


    }
}

/****************************************************************************
*
*  ROUTINE       : vertical_band_4_5_scale_mmx
*
*  INPUTS        : unsigned char *dest    :
*                  unsigned int dest_pitch :
*                  unsigned int dest_width :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
*
*  SPECIAL NOTES : The routine uses the first line of the band below
*                  the current band. The function also has a "C" only
*                  version.
*
****************************************************************************/
static
void vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // tow lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        vs_4_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constan

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128

        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
        movq        mm1,    [rdi]                   // mm1=Src[3];

        // mm0, mm2 --- Src[2]
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [rdi], mm0            // write des[3]

        //  mm1, mm3 --- Src[3]
        //  mm7 -- cleared for unpacking

        movq        mm0,    [rdi+rcx*2]             // mm0, Src[0] of the next group

        movq        mm5,    four_fifths              // mm5 = 4/5
        pmullw      mm1,    mm5                     // d * 4/5

        movq        mm6,    one_fifth                // mm6 = 1/5
        movq        mm2,    mm0                     // make a copy

        pmullw      mm3,    mm5                     // d * 4/5
        punpcklbw   mm0,    mm7                     // unpack low

        pmullw      mm0,    mm6                     // an * 1/5
        punpckhbw   mm2,    mm7                     // unpack high

        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
        pmullw      mm2,    mm6                     // an * 1/5

        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
        paddw       mm1,    round_values             // + 128

        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[4]

        movq        QWORD ptr [rdi+rcx], mm1        // write des[4]

        add         rdi,    8
        add         rsi,    8

        sub         rdx,    8
        jg          vs_4_5_loop
    }
}

/****************************************************************************
*
*  ROUTINE       : last_vertical_band_4_5_scale_mmx
*
*  INPUTS        : unsigned char *dest    :
*                  unsigned int dest_pitch :
*                  unsigned int dest_width :
*
*  OUTPUTS       : None.
*
*  RETURNS       : None
*
*  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
*
*  SPECIAL NOTES : The routine uses the first line of the band below
*                  the current band. The function also has an "C" only
*                  version.
*
****************************************************************************/
static
void last_vertical_band_4_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // tow lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        last_vs_4_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    one_fifth
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 1/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 1/5
        movq        mm6,    four_fifths               // constan

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 4/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 4/5
        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5

        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm5,    two_fifths
        movq        mm2,    mm0                     // make a copy

        pmullw      mm1,    mm5                     // b * 2/5
        movq        mm6,    three_fifths


        punpcklbw   mm0,    mm7                     // unpack low to word
        pmullw      mm3,    mm5                     // b * 2/5

        movq        mm4,    mm0                     // make copy of c
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm4,    mm6                     // c * 3/5
        movq        mm5,    mm2

        pmullw      mm5,    mm6                     // c * 3/5
        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5

        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
        paddw       mm1,    round_values             // + 128

        paddw       mm3,    round_values             // + 128
        psrlw       mm1,    8

        psrlw       mm3,    8
        packuswb    mm1,    mm3                     // des[2]

        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
        movq        mm1,    [rdi]                   // mm1=Src[3];

        movq        QWORD ptr [rdi+rcx], mm1        // write des[4];

        // mm0, mm2 --- Src[2]
        // mm1 --- Src[3]
        // mm6 --- 3/5
        // mm7 for unpacking

        pmullw      mm0,    mm6                     // c * 3/5
        movq        mm5,    two_fifths               // mm5 = 2/5

        movq        mm3,    mm1                     // make a copy
        pmullw      mm2,    mm6                     // c * 3/5

        punpcklbw   mm1,    mm7                     // unpack low
        movq        mm4,    mm1                     // make a copy

        punpckhbw   mm3,    mm7                     // unpack high
        pmullw      mm4,    mm5                     // d * 2/5

        movq        mm6,    mm3                     // make a copy
        pmullw      mm6,    mm5                     // d * 2/5

        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5

        paddw       mm0,    round_values             // + 128
        paddw       mm2,    round_values             // + 128

        psrlw       mm0,    8
        psrlw       mm2,    8

        packuswb    mm0,    mm2                     // des[3]
        movq        QWORD ptr [rdi], mm0            // write des[3]

        //  mm1, mm3 --- Src[3]
        //  mm7 -- cleared for unpacking
        add         rdi,    8
        add         rsi,    8

        sub         rdx,    8
        jg          last_vs_4_5_loop
    }
}

/****************************************************************************
*
*  ROUTINE       : vertical_band_3_5_scale_mmx
*
*  INPUTS        : unsigned char *dest    :
*                  unsigned int dest_pitch :
*                  unsigned int dest_width :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
*
*  SPECIAL NOTES : The routine uses the first line of the band below
*                  the current band. The function also has an "C" only
*                  version.
*
****************************************************************************/
static
void vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // two lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        vs_3_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths               // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths             // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]

        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths              // b * 4/5 low

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths              // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth                // b * 1/5

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth                // b * 1/5

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        pmullw      mm6,    one_fifth                // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    one_fifth                // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths              // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths              // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]

        paddw       mm4,    round_values             // + 128
        paddw       mm5,    round_values             // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [rdi], mm4            // write des[3]

        //  mm0, mm2 --- Src[3]

        pxor        mm7,    mm7                     // clear mm7 for unpacking
        movq        mm1,    [rdi+rcx*2]             // mm1 = Src[0] of the next group

        movq        mm5,    three_fifths             // mm5 = 3/5
        pmullw      mm0,    mm5                     // d * 3/5

        movq        mm6,    two_fifths                // mm6 = 2/5
        movq        mm3,    mm1                     // make a copy

        pmullw      mm2,    mm5                     // d * 3/5
        punpcklbw   mm1,    mm7                     // unpack low

        pmullw      mm1,    mm6                     // an * 2/5
        punpckhbw   mm3,    mm7                     // unpack high

        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
        pmullw      mm3,    mm6                     // an * 2/5

        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des[4]

        movq        QWORD ptr [rdi+rcx], mm0        // write des[4]

        add         rdi,    8
        add         rsi,    8

        sub         rdx,    8
        jg          vs_3_5_loop
    }
}

/****************************************************************************
*
*  ROUTINE       : last_vertical_band_3_5_scale_mmx
*
*  INPUTS        : unsigned char *dest    :
*                  unsigned int dest_pitch :
*                  unsigned int dest_width :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
*
*  SPECIAL NOTES : The routine uses the first line of the band below
*                  the current band. The function also has an "C" only
*                  version.
*
****************************************************************************/
static
void last_vertical_band_3_5_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        lea         rdi,    [rsi+rcx*2]             // tow lines below
        add         rdi,    rcx                     // three lines below

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter


        last_vs_3_5_loop:

        movq        mm0,    QWORD ptr [rsi]         // src[0];
        movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];

        movq        mm2,    mm0                     // Make a copy
        punpcklbw   mm0,    mm7                     // unpack low to word

        movq        mm5,    two_fifths               // mm5 = 2/5
        punpckhbw   mm2,    mm7                     // unpack high to word

        pmullw      mm0,    mm5                     // a * 2/5

        movq        mm3,    mm1                     // make a copy
        punpcklbw   mm1,    mm7                     // unpack low to word

        pmullw      mm2,    mm5                     // a * 2/5
        movq        mm6,    three_fifths             // mm6 = 3/5

        movq        mm4,    mm1                     // copy of low b
        pmullw      mm4,    mm6                     // b * 3/5

        punpckhbw   mm3,    mm7                     // unpack high to word
        movq        mm5,    mm3                     // copy of high b

        pmullw      mm5,    mm6                     // b * 3/5
        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5

        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
        paddw       mm0,    round_values             // + 128

        paddw       mm2,    round_values             // + 128
        psrlw       mm0,    8

        psrlw       mm2,    8
        packuswb    mm0,    mm2                     // des [1]

        movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
        movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]


        // mm1, mm3 --- Src[1]
        // mm0 --- Src[2]
        // mm7 for unpacking

        movq        mm4,    mm1                     // b low
        pmullw      mm1,    four_fifths              // b * 4/5 low

        movq        QWORD ptr [rdi+rcx], mm0        // write des[4]

        movq        mm5,    mm3                     // b high
        pmullw      mm3,    four_fifths              // b * 4/5 high

        movq        mm2,    mm0                     // c
        pmullw      mm4,    one_fifth                // b * 1/5

        punpcklbw   mm0,    mm7                     // c low
        pmullw      mm5,    one_fifth                // b * 1/5

        movq        mm6,    mm0                     // make copy of c low
        punpckhbw   mm2,    mm7                     // c high

        pmullw      mm6,    one_fifth                // c * 1/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    one_fifth                // c * 1/5 high
        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low

        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
        movq        mm6,    mm0                     // make copy of c low

        pmullw      mm6,    four_fifths              // c * 4/5 low
        movq        mm7,    mm2                     // make copy of c high

        pmullw      mm7,    four_fifths              // c * 4/5 high

        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high

        paddw       mm1,    round_values             // + 128
        paddw       mm3,    round_values             // + 128

        psrlw       mm1,    8
        psrlw       mm3,    8

        packuswb    mm1,    mm3                     // des[2]
        movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]

        paddw       mm4,    round_values             // + 128
        paddw       mm5,    round_values             // + 128

        psrlw       mm4,    8
        psrlw       mm5,    8

        packuswb    mm4,    mm5                     // des[3]
        movq        QWORD ptr [rdi], mm4            // write des[3]

        //  mm0, mm2 --- Src[3]

        add         rdi,    8
        add         rsi,    8

        sub         rdx,    8
        jg          last_vs_3_5_loop
    }
}

/****************************************************************************
*
*  ROUTINE       : vertical_band_1_2_scale_mmx
*
*  INPUTS        : unsigned char *dest    :
*                  unsigned int dest_pitch :
*                  unsigned int dest_width :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
*
*  SPECIAL NOTES : The routine uses the first line of the band below
*                  the current band. The function also has an "C" only
*                  version.
*
****************************************************************************/
static
void vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {

        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        pxor        mm7,    mm7                     // clear out mm7
        mov         edx,    dest_width               // Loop counter

        vs_1_2_loop:

        movq        mm0,    [rsi]                   // get Src[0]
        movq        mm1,    [rsi + rcx * 2]         // get Src[1]

        movq        mm2,    mm0                     // make copy before unpack
        movq        mm3,    mm1                     // make copy before unpack

        punpcklbw   mm0,    mm7                     // low Src[0]
        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1

        punpcklbw   mm1,    mm7                     // low Src[1]
        paddw       mm0,    mm1                     // low (a + b)

        punpckhbw   mm2,    mm7                     // high Src[0]
        paddw       mm0,    mm6                     // low (a + b + 1)

        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3                     // high (a + b )

        psraw       mm0,    1                       // low (a + b +1 )/2
        paddw       mm2,    mm6                     // high (a + b + 1)

        psraw       mm2,    1                       // high (a + b + 1)/2
        packuswb    mm0,    mm2                     // pack results

        movq        [rsi+rcx], mm0                  // write out eight bytes
        add         rsi,    8

        sub         rdx,    8
        jg          vs_1_2_loop
    }

}

/****************************************************************************
*
*  ROUTINE       : last_vertical_band_1_2_scale_mmx
*
*  INPUTS        : unsigned char *dest    :
*                  unsigned int dest_pitch :
*                  unsigned int dest_width :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 1 to 2 up-scaling of band of pixels.
*
*  SPECIAL NOTES : The routine uses the first line of the band below
*                  the current band. The function also has an "C" only
*                  version.
*
****************************************************************************/
static
void last_vertical_band_1_2_scale_mmx
(
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,    dest                    // Get the source and destination pointer
        mov         ecx,    dest_pitch               // Get the pitch size

        mov         edx,    dest_width               // Loop counter

        last_vs_1_2_loop:

        movq        mm0,    [rsi]                   // get Src[0]
        movq        [rsi+rcx], mm0                  // write out eight bytes

        add         rsi,    8
        sub         rdx,    8

        jg          last_vs_1_2_loop
    }
}

/****************************************************************************
*
*  ROUTINE       : horizontal_line_1_2_scale
*
*  INPUTS        : const unsigned char *source :
*                  unsigned int source_width    :
*                  unsigned char *dest         :
*                  unsigned int dest_width      :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
*
*  SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_1_2_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    __asm
    {
        mov         rsi,    source
        mov         rdi,    dest

        pxor        mm7,    mm7
        movq        mm6,    four_ones

        mov         ecx,    source_width

        hs_1_2_loop:

        movq        mm0,    [rsi]
        movq        mm1,    [rsi+1]

        movq        mm2,    mm0
        movq        mm3,    mm1

        movq        mm4,    mm0
        punpcklbw   mm0,    mm7

        punpcklbw   mm1,    mm7
        paddw       mm0,    mm1

        paddw       mm0,    mm6
        punpckhbw   mm2,    mm7

        punpckhbw   mm3,    mm7
        paddw       mm2,    mm3

        paddw       mm2,    mm6
        psraw       mm0,    1

        psraw       mm2,    1
        packuswb    mm0,    mm2

        movq        mm2,    mm4
        punpcklbw   mm2,    mm0

        movq        [rdi],  mm2
        punpckhbw   mm4,    mm0

        movq        [rdi+8], mm4
        add         rsi,    8

        add         rdi,    16
        sub         rcx,    8

        cmp         rcx,    8
        jg          hs_1_2_loop

// last eight pixel

        movq        mm0,    [rsi]
        movq        mm1,    mm0

        movq        mm2,    mm0
        movq        mm3,    mm1

        psrlq       mm1,    8
        psrlq       mm3,    56

        psllq       mm3,    56
        por         mm1,    mm3

        movq        mm3,    mm1
        movq        mm4,    mm0

        punpcklbw   mm0,    mm7
        punpcklbw   mm1,    mm7

        paddw       mm0,    mm1
        paddw       mm0,    mm6

        punpckhbw   mm2,    mm7
        punpckhbw   mm3,    mm7

        paddw       mm2,    mm3
        paddw       mm2,    mm6

        psraw       mm0,    1
        psraw       mm2,    1

        packuswb    mm0,    mm2
        movq        mm2,    mm4

        punpcklbw   mm2,    mm0
        movq        [rdi],  mm2

        punpckhbw   mm4,    mm0
        movq        [rdi+8], mm4
    }
}


__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };


/****************************************************************************
*
*  ROUTINE       : horizontal_line_5_4_scale_mmx
*
*  INPUTS        : const unsigned char *source : Pointer to source data.
*                  unsigned int source_width    : Stride of source.
*                  unsigned char *dest         : Pointer to destination data.
*                  unsigned int dest_width      : Stride of destination (NOT USED).
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : Copies horizontal line of pixels from source to
*                  destination scaling up by 4 to 5.
*
*  SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_5_4_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    /*
    unsigned i;
    unsigned int a, b, c, d, e;
    unsigned char *des = dest;
    const unsigned char *src = source;

    (void) dest_width;

    for ( i=0; i<source_width; i+=5 )
    {
        a = src[0];
        b = src[1];
        c = src[2];
        d = src[3];
        e = src[4];

        des[0] = a;
        des[1] = ((b*192 + c* 64 + 128)>>8);
        des[2] = ((c*128 + d*128 + 128)>>8);
        des[3] = ((d* 64 + e*192 + 128)>>8);

        src += 5;
        des += 4;
    }
    */
    __asm
    {

        mov         rsi,        source              ;
        mov         rdi,        dest                ;

        mov         ecx,        source_width         ;
        movq        mm5,        const54_1           ;

        pxor        mm7,        mm7                 ;
        movq        mm6,        const54_2           ;

        movq        mm4,        round_values         ;
        lea         rdx,        [rsi+rcx]           ;
        horizontal_line_5_4_loop:

        movq        mm0,        QWORD PTR  [rsi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07

        psrlq       mm0,        8                   ;
        01 02 03 04 05 06 07 xx
        punpcklbw   mm1,        mm7                 ;
        xx 00 xx 01 xx 02 xx 03

        punpcklbw   mm0,        mm7                 ;
        xx 01 xx 02 xx 03 xx 04
        pmullw      mm1,        mm5

        pmullw      mm0,        mm6
        add         rsi,        5

        add         rdi,        4
        paddw       mm1,        mm0

        paddw       mm1,        mm4
        psrlw       mm1,        8

        cmp         rsi,        rdx
        packuswb    mm1,        mm7

        movd        DWORD PTR [rdi-4], mm1

        jl          horizontal_line_5_4_loop

    }

}
__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };

static
void vertical_band_5_4_scale_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{

    __asm
    {

        mov         rsi,    source                    // Get the source and destination pointer
        mov         ecx,    src_pitch               // Get the pitch size

        mov         rdi,    dest                    // tow lines below
        pxor        mm7,    mm7                     // clear out mm7

        mov         edx,    dest_pitch               // Loop counter
        mov         ebx,    dest_width

        vs_5_4_loop:

        movd        mm0,    DWORD ptr [rsi]         // src[0];
        movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];

        movd        mm2,    DWORD ptr [rsi+rcx*2]
        lea         rax,    [rsi+rcx*2]             //

        punpcklbw   mm1,    mm7
        punpcklbw   mm2,    mm7

        movq        mm3,    mm2
        pmullw      mm1,    three_fourths

        pmullw      mm2,    one_fourths
        movd        mm4,    [rax+rcx]

        pmullw      mm3,    two_fourths
        punpcklbw   mm4,    mm7

        movq        mm5,    mm4
        pmullw      mm4,    two_fourths

        paddw       mm1,    mm2
        movd        mm6,    [rax+rcx*2]

        pmullw      mm5,    one_fourths
        paddw       mm1,    round_values;

        paddw       mm3,    mm4
        psrlw       mm1,    8

        punpcklbw   mm6,    mm7
        paddw       mm3,    round_values

        pmullw      mm6,    three_fourths
        psrlw       mm3,    8

        packuswb    mm1,    mm7
        packuswb    mm3,    mm7

        movd        DWORD PTR [rdi], mm0
        movd        DWORD PTR [rdi+rdx], mm1


        paddw       mm5,    mm6
        movd        DWORD PTR [rdi+rdx*2], mm3

        lea         rax,    [rdi+rdx*2]
        paddw       mm5,    round_values

        psrlw       mm5,    8
        add         rdi,    4

        packuswb    mm5,    mm7
        movd        DWORD PTR [rax+rdx], mm5

        add         rsi,    4
        sub         rbx,    4

        jg         vs_5_4_loop
    }
}


__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };


static
void horizontal_line_5_3_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    __asm
    {

        mov         rsi,        source              ;
        mov         rdi,        dest                ;

        mov         ecx,        source_width         ;
        movq        mm5,        const53_1           ;

        pxor        mm7,        mm7                 ;
        movq        mm6,        const53_2           ;

        movq        mm4,        round_values         ;
        lea         rdx,        [rsi+rcx-5]         ;
        horizontal_line_5_3_loop:

        movq        mm0,        QWORD PTR  [rsi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07

        psllw       mm0,        8                   ;
        xx 00 xx 02 xx 04 xx 06
        psrlw       mm1,        8                   ;
        01 xx 03 xx 05 xx 07 xx

        psrlw       mm0,        8                   ;
        00 xx 02 xx 04 xx 06 xx
        psllq       mm1,        16                  ;
        xx xx 01 xx 03 xx 05 xx

        pmullw      mm0,        mm6

        pmullw      mm1,        mm5
        add         rsi,        5

        add         rdi,        3
        paddw       mm1,        mm0

        paddw       mm1,        mm4
        psrlw       mm1,        8

        cmp         rsi,        rdx
        packuswb    mm1,        mm7

        movd        DWORD PTR [rdi-3], mm1
        jl          horizontal_line_5_3_loop

//exit condition
        movq        mm0,        QWORD PTR  [rsi]    ;
        00 01 02 03 04 05 06 07
        movq        mm1,        mm0                 ;
        00 01 02 03 04 05 06 07

        psllw       mm0,        8                   ;
        xx 00 xx 02 xx 04 xx 06
        psrlw       mm1,        8                   ;
        01 xx 03 xx 05 xx 07 xx

        psrlw       mm0,        8                   ;
        00 xx 02 xx 04 xx 06 xx
        psllq       mm1,        16                  ;
        xx xx 01 xx 03 xx 05 xx

        pmullw      mm0,        mm6

        pmullw      mm1,        mm5
        paddw       mm1,        mm0

        paddw       mm1,        mm4
        psrlw       mm1,        8

        packuswb    mm1,        mm7
        movd        rax,        mm1

        mov         rdx,        rax
        shr         rdx,        16

        mov         WORD PTR[rdi],   ax
        mov         BYTE PTR[rdi+2], dl

    }

}

__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };

static
void vertical_band_5_3_scale_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{

    __asm
    {

        mov         rsi,    source                    // Get the source and destination pointer
        mov         ecx,    src_pitch               // Get the pitch size

        mov         rdi,    dest                    // tow lines below
        pxor        mm7,    mm7                     // clear out mm7

        mov         edx,    dest_pitch               // Loop counter
        movq        mm5,    one_thirds

        movq        mm6,    two_thirds
        mov         ebx,    dest_width;

        vs_5_3_loop:

        movd        mm0,    DWORD ptr [rsi]         // src[0];
        movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];

        movd        mm2,    DWORD ptr [rsi+rcx*2]
        lea         rax,    [rsi+rcx*2]             //

        punpcklbw   mm1,    mm7
        punpcklbw   mm2,    mm7

        pmullw      mm1,    mm5
        pmullw      mm2,    mm6

        movd        mm3,    DWORD ptr [rax+rcx]
        movd        mm4,    DWORD ptr [rax+rcx*2]

        punpcklbw   mm3,    mm7
        punpcklbw   mm4,    mm7

        pmullw      mm3,    mm6
        pmullw      mm4,    mm5


        movd        DWORD PTR [rdi], mm0
        paddw       mm1,    mm2

        paddw       mm1,    round_values
        psrlw       mm1,    8

        packuswb    mm1,    mm7
        paddw       mm3,    mm4

        paddw       mm3,    round_values
        movd        DWORD PTR [rdi+rdx], mm1

        psrlw       mm3,    8
        packuswb    mm3,    mm7

        movd        DWORD PTR [rdi+rdx*2], mm3


        add         rdi,    4
        add         rsi,    4

        sub         rbx,    4
        jg          vs_5_3_loop
    }
}


/****************************************************************************
*
*  ROUTINE       : horizontal_line_2_1_scale
*
*  INPUTS        : const unsigned char *source :
*                  unsigned int source_width    :
*                  unsigned char *dest         :
*                  unsigned int dest_width      :
*
*  OUTPUTS       : None.
*
*  RETURNS       : void
*
*  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
*
*  SPECIAL NOTES : None.
*
****************************************************************************/
static
void horizontal_line_2_1_scale_mmx
(
    const unsigned char *source,
    unsigned int source_width,
    unsigned char *dest,
    unsigned int dest_width
)
{
    (void) dest_width;

    __asm
    {
        mov         rsi,    source
        mov         rdi,    dest

        pxor        mm7,    mm7
        mov         ecx,    dest_width

        xor         rdx,    rdx
        hs_2_1_loop:

        movq        mm0,    [rsi+rdx*2]
        psllw       mm0,    8

        psrlw       mm0,    8
        packuswb    mm0,    mm7

        movd        DWORD Ptr [rdi+rdx], mm0;
        add         rdx,    4

        cmp         rdx,    rcx
        jl          hs_2_1_loop

    }
}


static
void vertical_band_2_1_scale_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width)
{
    vpx_memcpy(dest, source, dest_width);
}


__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };

static
void vertical_band_2_1_scale_i_mmx
(
    unsigned char *source,
    unsigned int src_pitch,
    unsigned char *dest,
    unsigned int dest_pitch,
    unsigned int dest_width
)
{
    __asm
    {
        mov         rsi,        source
        mov         rdi,        dest

        mov         eax,        src_pitch
        mov         edx,        dest_width

        pxor        mm7,        mm7
        sub         rsi,        rax             //back one line


        lea         rcx,        [rsi+rdx];
        movq        mm6,        round_values;

        movq        mm5,        three_sixteenths;
        movq        mm4,        ten_sixteenths;

        vs_2_1_i_loop:
        movd        mm0,        [rsi]           //
        movd        mm1,        [rsi+rax]       //

        movd        mm2,        [rsi+rax*2]     //
        punpcklbw   mm0,        mm7

        pmullw      mm0,        mm5
        punpcklbw   mm1,        mm7

        pmullw      mm1,        mm4
        punpcklbw   mm2,        mm7

        pmullw      mm2,        mm5
        paddw       mm0,        round_values

        paddw       mm1,        mm2
        paddw       mm0,        mm1

        psrlw       mm0,        8
        packuswb    mm0,        mm7

        movd        DWORD PTR [rdi],        mm0
        add         rsi,        4

        add         rdi,        4;
        cmp         rsi,        rcx
        jl          vs_2_1_i_loop

    }
}


void
register_mmxscalers(void)
{
    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;

    vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
    vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
    vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
    vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
    vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
    vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
    vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
}