Home | History | Annotate | Download | only in intel_linux
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 /****************************************************************************
     13 *
     14 *   Module Title :     scaleopt.cpp
     15 *
     16 *   Description  :     Optimized scaling functions
     17 *
     18 ****************************************************************************/
     19 #include "pragmas.h"
     20 
     21 /****************************************************************************
     22 *  Module Statics
     23 ****************************************************************************/
     /*
      * Disabled (#if 0) file-scope copies of the scaling constants. The
      * functions below declare the tables they need as locals instead.
      * The weights are fifths in 8.8 fixed point (51 ~ 256/5, 102 ~ 2*256/5,
      * 154 ~ 3*256/5, 205 ~ 4*256/5); 128 is the rounding term added before
      * the ">> 8" that the scaling code applies.
      */
     24 #if 0
     25 __declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
     26 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
     27 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
     28 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
     29 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
     30 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
     31 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
     32 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
     33 __declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
     34 __declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
     35 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
     36 #endif
     37 
     38 #include "vpx_scale/vpxscale.h"
     39 #include "vpx_mem/vpx_mem.h"
     40 
     41 /****************************************************************************
     42  *
     43  *  ROUTINE       : horizontal_line_3_5_scale_mmx
     44  *
     45  *  INPUTS        : const unsigned char *source :
     46  *                  unsigned int source_width    :
     47  *                  unsigned char *dest         :
     48  *                  unsigned int dest_width      :
     49  *
     50  *  OUTPUTS       : None.
     51  *
     52  *  RETURNS       : void
     53  *
     54  *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
     55  *
     56  *  SPECIAL NOTES : None.
     57  *
     58  ****************************************************************************/
     59 static
     60 void horizontal_line_3_5_scale_mmx
     61 (
     62     const unsigned char *source,
     63     unsigned int source_width,
     64     unsigned char *dest,
     65     unsigned int dest_width
     66 )
     67 {
     68     __declspec(align(16)) unsigned short const35_2[] = { 154,  51, 205, 102 };
     69     __declspec(align(16)) unsigned short const35_1[] = { 102, 205,  51, 154 };
     70     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
     71 
     72     (void) dest_width;
     73 
     74     __asm
     75     {
     76 
     77         push ebx
     78 
     79         mov         esi,    source
     80         mov         edi,    dest
     81 
     82         mov         ecx,    source_width
     83         lea         edx,    [esi+ecx-3];
     84 
     85         movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
     86         movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
     87 
     88         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
     89         pxor        mm7,    mm7             // clear mm7
     90 
     91         horiz_line_3_5_loop:
     92 
     93         mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
     94         mov        ebx,    eax
     95 
     96         and         ebx,    0xffff00        // ebx = xx 01 02 xx
     97         mov         ecx,    eax             // ecx = 00 01 02 03
     98 
     99         and         eax,    0xffff0000      // eax = xx xx 02 03
    100         xor         ecx,    eax             // ecx = 00 01 xx xx
    101 
    102         shr         ebx,    8               // ebx = 01 02 xx xx
    103         or          eax,    ebx             // eax = 01 02 02 03
    104 
    105         shl         ebx,    16              // ebx = xx xx 01 02
    106         movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
    107 
    108         or          ebx,    ecx             // ebx = 00 01 01 02
    109         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
    110 
    111         movd        mm0,    ebx             // mm0 = 00 01 01 02
    112         pmullw      mm1,    mm6             //
    113 
    114         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    115         pmullw      mm0,    mm5             //
    116 
    117         mov         [edi],  ebx             // writeoutput 00 xx xx xx
    118         add         esi,    3
    119 
    120         add         edi,    5
    121         paddw       mm0,    mm1
    122 
    123         paddw       mm0,    mm4
    124         psrlw       mm0,    8
    125 
    126         cmp         esi,    edx
    127         packuswb    mm0,    mm7
    128 
    129         movd        DWORD Ptr [edi-4], mm0
    130         jl          horiz_line_3_5_loop
    131 
    132 //Exit:
    133         mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
    134         mov         ebx,    eax
    135 
    136         and         ebx,    0xffff00        // ebx = xx 01 02 xx
    137         mov         ecx,    eax             // ecx = 00 01 02 03
    138 
    139         and         eax,    0xffff0000      // eax = xx xx 02 03
    140         xor         ecx,    eax             // ecx = 00 01 xx xx
    141 
    142         shr         ebx,    8               // ebx = 01 02 xx xx
    143         or          eax,    ebx             // eax = 01 02 02 03
    144 
    145         shl         eax,    8               // eax = xx 01 02 02
    146         and         eax,    0xffff0000      // eax = xx xx 02 02
    147 
    148         or          eax,    ebx             // eax = 01 02 02 02
    149 
    150         shl         ebx,    16              // ebx = xx xx 01 02
    151         movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
    152 
    153         or          ebx,    ecx             // ebx = 00 01 01 02
    154         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
    155 
    156         movd        mm0,    ebx             // mm0 = 00 01 01 02
    157         pmullw      mm1,    mm6             //
    158 
    159         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    160         pmullw      mm0,    mm5             //
    161 
    162         mov         [edi],  ebx             // writeoutput 00 xx xx xx
    163         paddw       mm0,    mm1
    164 
    165         paddw       mm0,    mm4
    166         psrlw       mm0,    8
    167 
    168         packuswb    mm0,    mm7
    169         movd        DWORD Ptr [edi+1], mm0
    170 
    171         pop ebx
    172 
    173     }
    174 
    175     /*
    176     const unsigned char *src = source;
    177     unsigned char *des = dest;
    178     unsigned int a, b, c ;
    179     unsigned int i;
    180     (void) dest_width;
    181 
    182     for ( i=0; i<source_width-3; i+=3 )
    183     {
    184         a = src[0];
    185         b = src[1];
    186         des [0] = (UINT8) (a);
    187         // 2 * left + 3 * right /5
    188         des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
    189         c = src[2] ;
    190         // 4 * left + 1 * right /5
    191         des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
    192         // 1 * left + 4 * right /5
    193         des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
    194 
    195         a = src[3];
    196         // 3 * left + 2 * right /5
    197         des [4] = (UINT8) (( c * 154 + a * 102 + 128 ) >> 8);
    198 
    199         src += 3;
    200         des += 5;
    201     }
    202 
    203     a = src[0];
    204     b = src[1];
    205     des [0] = (UINT8) (a);
    206     // 2 * left + 3 * right /5
    207     des [1] = (UINT8) (( a * 102 + 154 * b + 128 ) >> 8);
    208     c = src[2] ;
    209     // 4 * left + 1 * right /5
    210     des [2] = (UINT8) (( b * 205 + c * 51 + 128 ) >> 8);
    211     // 1 * left + 4 * right /5
    212     des [3] = (UINT8) (( b * 51 + c * 205 + 128 ) >> 8);
    213 
    214     des [4] = (UINT8) (c);
    215     */
    216 }
    217 
    218 
    219 /****************************************************************************
    220  *
    221  *  ROUTINE       : horizontal_line_4_5_scale_mmx
    222  *
    223  *  INPUTS        : const unsigned char *source :
    224  *                  unsigned int source_width    :
    225  *                  unsigned char *dest         :
    226  *                  unsigned int dest_width      :
    227  *
    228  *  OUTPUTS       : None.
    229  *
    230  *  RETURNS       : void
    231  *
    232  *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
    233  *
    234  *  SPECIAL NOTES : None.
    235  *
    236  ****************************************************************************/
    237 static
    238 void horizontal_line_4_5_scale_mmx
    239 (
    240     const unsigned char *source,
    241     unsigned int source_width,
    242     unsigned char *dest,
    243     unsigned int dest_width
    244 )
    245 {
    246     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    247     __declspec(align(16)) unsigned short const45_2[] = {205, 154, 102,  51 };
    248     __declspec(align(16)) unsigned short const45_1[] = { 51, 102, 154, 205 };
    249     __declspec(align(16)) unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
    250 
    251     (void)dest_width;
    252 
    253     __asm
    254     {
    255 
    256         mov         esi,    source
    257         mov         edi,    dest
    258 
    259         mov         ecx,    source_width
    260         lea         edx,    [esi+ecx-8];
    261 
    262         movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
    263         movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
    264 
    265         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
    266         pxor        mm7,    mm7             // clear mm7
    267 
    268         horiz_line_4_5_loop:
    269 
    270         movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
    271         movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
    272 
    273         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    274         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
    275 
    276         movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
    277         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
    278 
    279         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    280         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
    281 
    282         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    283         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
    284 
    285         movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
    286         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
    287 
    288         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
    289         pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
    290 
    291         paddw       mm0,    mm1             // added round values
    292         paddw       mm0,    mm4
    293 
    294         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
    295         packuswb    mm0,    mm7
    296 
    297         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
    298         add         edi,    10
    299 
    300         add         esi,    8
    301         paddw       mm2,    mm3             //
    302 
    303         paddw       mm2,    mm4             // added round values
    304         cmp         esi,    edx
    305 
    306         psrlw       mm2,    8
    307         packuswb    mm2,    mm7
    308 
    309         movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
    310         jl         horiz_line_4_5_loop
    311 
    312 //Exit:
    313         movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
    314         movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
    315 
    316         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    317         psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
    318 
    319         movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
    320         pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
    321 
    322         psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
    323         por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
    324 
    325         movq        mm3,    mm1
    326 
    327         movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
    328         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
    329 
    330         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    331         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
    332 
    333         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    334         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
    335 
    336         movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
    337         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
    338 
    339         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
    340         pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
    341 
    342         paddw       mm0,    mm1             // added round values
    343         paddw       mm0,    mm4
    344 
    345         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
    346         packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
    347 
    348         movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
    349         paddw       mm2,    mm3             //
    350 
    351         paddw       mm2,    mm4             // added round values
    352         psrlw       mm2,    8
    353 
    354         packuswb    mm2,    mm7
    355         movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09
    356 
    357 
    358     }
    359     /*
    360         const unsigned char *src = source;
    361         unsigned char *des = dest;
    362         unsigned int a, b, c ;
    363         unsigned i;
    364         (void) dest_width;
    365 
    366         for ( i=0; i<source_width-4; i+=4 )
    367         {
    368             a = src[0];
    369             b = src[1];
    370             des [0] = (UINT8) a;
    371             des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
    372             c = src[2] * 154;
    373             a = src[3];
    374             des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
    375             des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
    376             b = src[4];
    377             des [4] = (UINT8) (( a * 205 + 51 * b + 128) >> 8);
    378 
    379             src += 4;
    380             des += 5;
    381         }
    382 
    383         a = src[0];
    384         b = src[1];
    385         des [0] = (UINT8) (a);
    386         des [1] = (UINT8) (( a * 51 + 205 * b + 128) >> 8);
    387         c = src[2] * 154;
    388         a = src[3];
    389         des [2] = (UINT8) (( b * 102 + c + 128) >> 8);
    390         des [3] = (UINT8) (( c + 102 * a + 128) >> 8);
    391         des [4] = (UINT8) (a);
    392     */
    393 }
    394 
    395 /****************************************************************************
    396  *
    397  *  ROUTINE       : vertical_band_4_5_scale_mmx
    398  *
    399  *  INPUTS        : unsigned char *dest    :
    400  *                  unsigned int dest_pitch :
    401  *                  unsigned int dest_width :
    402  *
    403  *  OUTPUTS       : None.
    404  *
    405  *  RETURNS       : void
    406  *
    407  *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
    408  *
    409  *  SPECIAL NOTES : The routine uses the first line of the band below
    410  *                  the current band. The function also has a "C" only
    411  *                  version.
    412  *
    413  ****************************************************************************/
    414 static
    415 void vertical_band_4_5_scale_mmx
    416 (
    417     unsigned char *dest,
    418     unsigned int dest_pitch,
    419     unsigned int dest_width
    420 )
    421 {
    422 
    423     __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    424     __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    425     __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    426     __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    427     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    428 
    429     __asm
    430     {
    431 
    432         mov         esi,    dest                    // Get the source and destination pointer
    433         mov         ecx,    dest_pitch               // Get the pitch size
    434 
    435         lea         edi,    [esi+ecx*2]             // tow lines below
    436         add         edi,    ecx                     // three lines below
    437 
    438         pxor        mm7,    mm7                     // clear out mm7
    439         mov         edx,    dest_width               // Loop counter
    440 
    441         vs_4_5_loop:
    442 
    443         movq        mm0,    QWORD ptr [esi]         // src[0];
    444         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    445 
    446         movq        mm2,    mm0                     // Make a copy
    447         punpcklbw   mm0,    mm7                     // unpack low to word
    448 
    449         movq        mm5,    one_fifth
    450         punpckhbw   mm2,    mm7                     // unpack high to word
    451 
    452         pmullw      mm0,    mm5                     // a * 1/5
    453 
    454         movq        mm3,    mm1                     // make a copy
    455         punpcklbw   mm1,    mm7                     // unpack low to word
    456 
    457         pmullw      mm2,    mm5                     // a * 1/5
    458         movq        mm6,    four_fifths               // constan
    459 
    460         movq        mm4,    mm1                     // copy of low b
    461         pmullw      mm4,    mm6                     // b * 4/5
    462 
    463         punpckhbw   mm3,    mm7                     // unpack high to word
    464         movq        mm5,    mm3                     // copy of high b
    465 
    466         pmullw      mm5,    mm6                     // b * 4/5
    467         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
    468 
    469         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    470         paddw       mm0,    round_values             // + 128
    471 
    472         paddw       mm2,    round_values             // + 128
    473         psrlw       mm0,    8
    474 
    475         psrlw       mm2,    8
    476         packuswb    mm0,    mm2                     // des [1]
    477 
    478         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    479         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    480 
    481         // mm1, mm3 --- Src[1]
    482         // mm0 --- Src[2]
    483         // mm7 for unpacking
    484 
    485         movq        mm5,    two_fifths
    486         movq        mm2,    mm0                     // make a copy
    487 
    488         pmullw      mm1,    mm5                     // b * 2/5
    489         movq        mm6,    three_fifths
    490 
    491 
    492         punpcklbw   mm0,    mm7                     // unpack low to word
    493         pmullw      mm3,    mm5                     // b * 2/5
    494 
    495         movq        mm4,    mm0                     // make copy of c
    496         punpckhbw   mm2,    mm7                     // unpack high to word
    497 
    498         pmullw      mm4,    mm6                     // c * 3/5
    499         movq        mm5,    mm2
    500 
    501         pmullw      mm5,    mm6                     // c * 3/5
    502         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
    503 
    504         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    505         paddw       mm1,    round_values             // + 128
    506 
    507         paddw       mm3,    round_values             // + 128
    508         psrlw       mm1,    8
    509 
    510         psrlw       mm3,    8
    511         packuswb    mm1,    mm3                     // des[2]
    512 
    513         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    514         movq        mm1,    [edi]                   // mm1=Src[3];
    515 
    516         // mm0, mm2 --- Src[2]
    517         // mm1 --- Src[3]
    518         // mm6 --- 3/5
    519         // mm7 for unpacking
    520 
    521         pmullw      mm0,    mm6                     // c * 3/5
    522         movq        mm5,    two_fifths               // mm5 = 2/5
    523 
    524         movq        mm3,    mm1                     // make a copy
    525         pmullw      mm2,    mm6                     // c * 3/5
    526 
    527         punpcklbw   mm1,    mm7                     // unpack low
    528         movq        mm4,    mm1                     // make a copy
    529 
    530         punpckhbw   mm3,    mm7                     // unpack high
    531         pmullw      mm4,    mm5                     // d * 2/5
    532 
    533         movq        mm6,    mm3                     // make a copy
    534         pmullw      mm6,    mm5                     // d * 2/5
    535 
    536         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    537         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
    538 
    539         paddw       mm0,    round_values             // + 128
    540         paddw       mm2,    round_values             // + 128
    541 
    542         psrlw       mm0,    8
    543         psrlw       mm2,    8
    544 
    545         packuswb    mm0,    mm2                     // des[3]
    546         movq        QWORD ptr [edi], mm0            // write des[3]
    547 
    548         //  mm1, mm3 --- Src[3]
    549         //  mm7 -- cleared for unpacking
    550 
    551         movq        mm0,    [edi+ecx*2]             // mm0, Src[0] of the next group
    552 
    553         movq        mm5,    four_fifths              // mm5 = 4/5
    554         pmullw      mm1,    mm5                     // d * 4/5
    555 
    556         movq        mm6,    one_fifth                // mm6 = 1/5
    557         movq        mm2,    mm0                     // make a copy
    558 
    559         pmullw      mm3,    mm5                     // d * 4/5
    560         punpcklbw   mm0,    mm7                     // unpack low
    561 
    562         pmullw      mm0,    mm6                     // an * 1/5
    563         punpckhbw   mm2,    mm7                     // unpack high
    564 
    565         paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
    566         pmullw      mm2,    mm6                     // an * 1/5
    567 
    568         paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
    569         paddw       mm1,    round_values             // + 128
    570 
    571         paddw       mm3,    round_values             // + 128
    572         psrlw       mm1,    8
    573 
    574         psrlw       mm3,    8
    575         packuswb    mm1,    mm3                     // des[4]
    576 
    577         movq        QWORD ptr [edi+ecx], mm1        // write des[4]
    578 
    579         add         edi,    8
    580         add         esi,    8
    581 
    582         sub         edx,    8
    583         jg         vs_4_5_loop
    584     }
    585 }
    586 
    587 /****************************************************************************
    588  *
    589  *  ROUTINE       : last_vertical_band_4_5_scale_mmx
    590  *
    591  *  INPUTS        : unsigned char *dest    :
    592  *                  unsigned int dest_pitch :
    593  *                  unsigned int dest_width :
    594  *
    595  *  OUTPUTS       : None.
    596  *
    597  *  RETURNS       : None
    598  *
    599  *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
    600  *
    601  *  SPECIAL NOTES : The routine uses the first line of the band below
    602  *                  the current band. The function also has an "C" only
    603  *                  version.
    604  *
    605  ****************************************************************************/
    606 static
    607 void last_vertical_band_4_5_scale_mmx
    608 (
    609     unsigned char *dest,
    610     unsigned int dest_pitch,
    611     unsigned int dest_width
    612 )
    613 {
    614     __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };
    615     __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };
    616     __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };
    617     __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };
    618     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
    619 
    620     __asm
    621     {
    622         mov         esi,    dest                    // Get the source and destination pointer
    623         mov         ecx,    dest_pitch               // Get the pitch size
    624 
    625         lea         edi,    [esi+ecx*2]             // tow lines below
    626         add         edi,    ecx                     // three lines below
    627 
    628         pxor        mm7,    mm7                     // clear out mm7
    629         mov         edx,    dest_width               // Loop counter
    630 
    631         last_vs_4_5_loop:
    632 
    633         movq        mm0,    QWORD ptr [esi]         // src[0];
    634         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    635 
    636         movq        mm2,    mm0                     // Make a copy
    637         punpcklbw   mm0,    mm7                     // unpack low to word
    638 
    639         movq        mm5,    one_fifth
    640         punpckhbw   mm2,    mm7                     // unpack high to word
    641 
    642         pmullw      mm0,    mm5                     // a * 1/5
    643 
    644         movq        mm3,    mm1                     // make a copy
    645         punpcklbw   mm1,    mm7                     // unpack low to word
    646 
    647         pmullw      mm2,    mm5                     // a * 1/5
    648         movq        mm6,    four_fifths               // constan
    649 
    650         movq        mm4,    mm1                     // copy of low b
    651         pmullw      mm4,    mm6                     // b * 4/5
    652 
    653         punpckhbw   mm3,    mm7                     // unpack high to word
    654         movq        mm5,    mm3                     // copy of high b
    655 
    656         pmullw      mm5,    mm6                     // b * 4/5
    657         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
    658 
    659         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    660         paddw       mm0,    round_values             // + 128
    661 
    662         paddw       mm2,    round_values             // + 128
    663         psrlw       mm0,    8
    664 
    665         psrlw       mm2,    8
    666         packuswb    mm0,    mm2                     // des [1]
    667 
    668         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    669         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    670 
    671         // mm1, mm3 --- Src[1]
    672         // mm0 --- Src[2]
    673         // mm7 for unpacking
    674 
    675         movq        mm5,    two_fifths
    676         movq        mm2,    mm0                     // make a copy
    677 
    678         pmullw      mm1,    mm5                     // b * 2/5
    679         movq        mm6,    three_fifths
    680 
    681 
    682         punpcklbw   mm0,    mm7                     // unpack low to word
    683         pmullw      mm3,    mm5                     // b * 2/5
    684 
    685         movq        mm4,    mm0                     // make copy of c
    686         punpckhbw   mm2,    mm7                     // unpack high to word
    687 
    688         pmullw      mm4,    mm6                     // c * 3/5
    689         movq        mm5,    mm2
    690 
    691         pmullw      mm5,    mm6                     // c * 3/5
    692         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
    693 
    694         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    695         paddw       mm1,    round_values             // + 128
    696 
    697         paddw       mm3,    round_values             // + 128
    698         psrlw       mm1,    8
    699 
    700         psrlw       mm3,    8
    701         packuswb    mm1,    mm3                     // des[2]
    702 
    703         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    704         movq        mm1,    [edi]                   // mm1=Src[3];
    705 
    706         movq        QWORD ptr [edi+ecx], mm1        // write des[4];
    707 
    708         // mm0, mm2 --- Src[2]
    709         // mm1 --- Src[3]
    710         // mm6 --- 3/5
    711         // mm7 for unpacking
    712 
    713         pmullw      mm0,    mm6                     // c * 3/5
    714         movq        mm5,    two_fifths               // mm5 = 2/5
    715 
    716         movq        mm3,    mm1                     // make a copy
    717         pmullw      mm2,    mm6                     // c * 3/5
    718 
    719         punpcklbw   mm1,    mm7                     // unpack low
    720         movq        mm4,    mm1                     // make a copy
    721 
    722         punpckhbw   mm3,    mm7                     // unpack high
    723         pmullw      mm4,    mm5                     // d * 2/5
    724 
    725         movq        mm6,    mm3                     // make a copy
    726         pmullw      mm6,    mm5                     // d * 2/5
    727 
    728         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    729         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
    730 
    731         paddw       mm0,    round_values             // + 128
    732         paddw       mm2,    round_values             // + 128
    733 
    734         psrlw       mm0,    8
    735         psrlw       mm2,    8
    736 
    737         packuswb    mm0,    mm2                     // des[3]
    738         movq        QWORD ptr [edi], mm0            // write des[3]
    739 
    740         //  mm1, mm3 --- Src[3]
    741         //  mm7 -- cleared for unpacking
    742         add         edi,    8
    743         add         esi,    8
    744 
    745         sub         edx,    8
    746         jg          last_vs_4_5_loop
    747     }
    748 }
    749 
    750 /****************************************************************************
    751  *
    752  *  ROUTINE       : vertical_band_3_5_scale_mmx
    753  *
    754  *  INPUTS        : unsigned char *dest    : Row 0 of the band; the band is
         *                                           rewritten in place.
    755  *                  unsigned int dest_pitch : Byte distance between rows.
    756  *                  unsigned int dest_width : Columns to scale. Eight columns
         *                                           are handled per pass and the
         *                                           loop always runs at least once.
    757  *
    758  *  OUTPUTS       : Rows 1, 2, 3 and 4 of the band are overwritten.
    759  *
    760  *  RETURNS       : void
    761  *
    762  *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
         *                  With a, b, c = rows 0, 1, 2 and an = row 5:
         *                    row 1 = (a * 102 + b  * 154 + 128) >> 8
         *                    row 2 = (b * 205 + c  *  51 + 128) >> 8
         *                    row 3 = (b *  51 + c  * 205 + 128) >> 8
         *                    row 4 = (c * 154 + an * 102 + 128) >> 8
    763  *
    764  *  SPECIAL NOTES : The routine uses the first line of the band below
    765  *                  the current band (row 5). The function also has a "C"
    766  *                  only version.
         *                  NOTE(review): no emms is executed here; presumably the
         *                  caller clears the MMX state - confirm.
    767  *
    768  ****************************************************************************/
    769 static
    770 void vertical_band_3_5_scale_mmx
    771 (
    772     unsigned char *dest,
    773     unsigned int dest_pitch,
    774     unsigned int dest_width
    775 )
    776 {
            // Blend weights as n/256 fixed-point fractions, one copy per 16-bit lane.
    777     __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };        //  51/256 ~ 1/5
    778     __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };    // 102/256 ~ 2/5
    779     __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };  // 154/256 ~ 3/5
    780     __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };   // 205/256 ~ 4/5
    781     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };  // +0.5 before the >> 8
    782 
    783     __asm
    784     {
    785         mov         esi,    dest                    // esi = row 0 of the band
    786         mov         ecx,    dest_pitch               // ecx = pitch (bytes per row)
    787 
    788         lea         edi,    [esi+ecx*2]             // two lines below
    789         add         edi,    ecx                     // three lines below: edi = row 3
    790 
    791         pxor        mm7,    mm7                     // clear out mm7 (zero for byte->word unpack)
    792         mov         edx,    dest_width               // Loop counter, counts down by 8
    793 
    794         vs_3_5_loop:
    795 
                // ---- row 1 = a * 2/5 + b * 3/5 ----
    796         movq        mm0,    QWORD ptr [esi]         // src[0];
    797         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    798 
    799         movq        mm2,    mm0                     // Make a copy
    800         punpcklbw   mm0,    mm7                     // unpack low to word
    801 
    802         movq        mm5,    two_fifths               // mm5 = 2/5
    803         punpckhbw   mm2,    mm7                     // unpack high to word
    804 
    805         pmullw      mm0,    mm5                     // a * 2/5
    806 
    807         movq        mm3,    mm1                     // make a copy
    808         punpcklbw   mm1,    mm7                     // unpack low to word
    809 
    810         pmullw      mm2,    mm5                     // a * 2/5
    811         movq        mm6,    three_fifths             // mm6 = 3/5
    812 
    813         movq        mm4,    mm1                     // copy of low b
    814         pmullw      mm4,    mm6                     // b * 3/5
    815 
    816         punpckhbw   mm3,    mm7                     // unpack high to word
    817         movq        mm5,    mm3                     // copy of high b
    818 
    819         pmullw      mm5,    mm6                     // b * 3/5
    820         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
    821 
    822         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    823         paddw       mm0,    round_values             // + 128
    824 
    825         paddw       mm2,    round_values             // + 128
    826         psrlw       mm0,    8
    827 
    828         psrlw       mm2,    8
    829         packuswb    mm0,    mm2                     // des [1]
    830 
    831         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
    832         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
    833 
                // ---- row 2 = b * 4/5 + c * 1/5,  row 3 = b * 1/5 + c * 4/5 ----
    834         // mm1, mm3 --- Src[1] (b), unpacked low/high words
    835         // mm0 --- Src[2] (c), still packed bytes
    836         // mm7 for unpacking
    837 
    838         movq        mm4,    mm1                     // b low
    839         pmullw      mm1,    four_fifths              // b * 4/5 low
    840 
    841         movq        mm5,    mm3                     // b high
    842         pmullw      mm3,    four_fifths              // b * 4/5 high
    843 
    844         movq        mm2,    mm0                     // c
    845         pmullw      mm4,    one_fifth                // b * 1/5
    846 
    847         punpcklbw   mm0,    mm7                     // c low
    848         pmullw      mm5,    one_fifth                // b * 1/5
    849 
    850         movq        mm6,    mm0                     // make copy of c low
    851         punpckhbw   mm2,    mm7                     // c high
    852 
    853         pmullw      mm6,    one_fifth                // c * 1/5 low
    854         movq        mm7,    mm2                     // make copy of c high (mm7 is scratch from here; re-zeroed below)
    855 
    856         pmullw      mm7,    one_fifth                // c * 1/5 high
    857         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
    858 
    859         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    860         movq        mm6,    mm0                     // make copy of c low
    861 
    862         pmullw      mm6,    four_fifths              // c * 4/5 low
    863         movq        mm7,    mm2                     // make copy of c high
    864 
    865         pmullw      mm7,    four_fifths              // c * 4/5 high
    866 
    867         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    868         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
    869 
    870         paddw       mm1,    round_values             // + 128
    871         paddw       mm3,    round_values             // + 128
    872 
    873         psrlw       mm1,    8
    874         psrlw       mm3,    8
    875 
    876         packuswb    mm1,    mm3                     // des[2]
    877         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
    878 
    879         paddw       mm4,    round_values             // + 128
    880         paddw       mm5,    round_values             // + 128
    881 
    882         psrlw       mm4,    8
    883         psrlw       mm5,    8
    884 
    885         packuswb    mm4,    mm5                     // des[3]
    886         movq        QWORD ptr [edi], mm4            // write des[3]
    887 
                // ---- row 4 = c * 3/5 + an * 2/5 ----
    888         //  mm0, mm2 --- Src[2] (c), unpacked low/high words
    889 
    890         pxor        mm7,    mm7                     // clear mm7 for unpacking (also needed by the next pass)
    891         movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group (row 5)
    892 
    893         movq        mm5,    three_fifths             // mm5 = 3/5
    894         pmullw      mm0,    mm5                     // c * 3/5
    895 
    896         movq        mm6,    two_fifths                // mm6 = 2/5
    897         movq        mm3,    mm1                     // make a copy
    898 
    899         pmullw      mm2,    mm5                     // c * 3/5
    900         punpcklbw   mm1,    mm7                     // unpack low
    901 
    902         pmullw      mm1,    mm6                     // an * 2/5
    903         punpckhbw   mm3,    mm7                     // unpack high
    904 
    905         paddw       mm0,    mm1                     // c * 3/5 + an * 2/5
    906         pmullw      mm3,    mm6                     // an * 2/5
    907 
    908         paddw       mm2,    mm3                     // c * 3/5 + an * 2/5
    909         paddw       mm0,    round_values             // + 128
    910 
    911         paddw       mm2,    round_values             // + 128
    912         psrlw       mm0,    8
    913 
    914         psrlw       mm2,    8
    915         packuswb    mm0,    mm2                     // des[4]
    916 
    917         movq        QWORD ptr [edi+ecx], mm0        // write des[4]
    918 
    919         add         edi,    8                       // advance both row pointers by 8 columns
    920         add         esi,    8
    921 
    922         sub         edx,    8                       // sets the flags tested by jg
    923         jg          vs_3_5_loop
    924     }
    925 }
    926 
    927 /****************************************************************************
    928  *
    929  *  ROUTINE       : last_vertical_band_3_5_scale_mmx
    930  *
    931  *  INPUTS        : unsigned char *dest    : Row 0 of the band; the band is
         *                                           rewritten in place.
    932  *                  unsigned int dest_pitch : Byte distance between rows.
    933  *                  unsigned int dest_width : Columns to scale. Eight columns
         *                                           are handled per pass and the
         *                                           loop always runs at least once.
    934  *
    935  *  OUTPUTS       : Rows 1, 2, 3 and 4 of the band are overwritten.
    936  *
    937  *  RETURNS       : void
    938  *
    939  *  FUNCTION      : 3 to 5 up-scaling of the last 3-pixel high band of pixels.
         *                  With a, b, c = rows 0, 1, 2:
         *                    row 1 = (a * 102 + b * 154 + 128) >> 8
         *                    row 2 = (b * 205 + c *  51 + 128) >> 8
         *                    row 3 = (b *  51 + c * 205 + 128) >> 8
         *                    row 4 = c
    940  *
    941  *  SPECIAL NOTES : Unlike vertical_band_3_5_scale_mmx this variant does not
    942  *                  read a line below the band; row 4 is a plain copy of
    943  *                  row 2. The function also has a "C" only version.
         *                  NOTE(review): no emms is executed here; presumably the
         *                  caller clears the MMX state - confirm.
    944  *
    945  ****************************************************************************/
    946 static
    947 void last_vertical_band_3_5_scale_mmx
    948 (
    949     unsigned char *dest,
    950     unsigned int dest_pitch,
    951     unsigned int dest_width
    952 )
    953 {
            // Blend weights as n/256 fixed-point fractions, one copy per 16-bit lane.
    954     __declspec(align(16)) unsigned short one_fifth[]  = { 51, 51, 51, 51 };        //  51/256 ~ 1/5
    955     __declspec(align(16)) unsigned short two_fifths[] = { 102, 102, 102, 102 };    // 102/256 ~ 2/5
    956     __declspec(align(16)) unsigned short three_fifths[] = { 154, 154, 154, 154 };  // 154/256 ~ 3/5
    957     __declspec(align(16)) unsigned short four_fifths[] = { 205, 205, 205, 205 };   // 205/256 ~ 4/5
    958     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };  // +0.5 before the >> 8
    959     __asm
    960     {
    961         mov         esi,    dest                    // esi = row 0 of the band
    962         mov         ecx,    dest_pitch               // ecx = pitch (bytes per row)
    963 
    964         lea         edi,    [esi+ecx*2]             // two lines below
    965         add         edi,    ecx                     // three lines below: edi = row 3
    966 
    967         pxor        mm7,    mm7                     // clear out mm7 (zero for byte->word unpack)
    968         mov         edx,    dest_width               // Loop counter, counts down by 8
    969 
    970 
    971         last_vs_3_5_loop:
    972 
                // ---- row 1 = a * 2/5 + b * 3/5 ----
    973         movq        mm0,    QWORD ptr [esi]         // src[0];
    974         movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
    975 
    976         movq        mm2,    mm0                     // Make a copy
    977         punpcklbw   mm0,    mm7                     // unpack low to word
    978 
    979         movq        mm5,    two_fifths               // mm5 = 2/5
    980         punpckhbw   mm2,    mm7                     // unpack high to word
    981 
    982         pmullw      mm0,    mm5                     // a * 2/5
    983 
    984         movq        mm3,    mm1                     // make a copy
    985         punpcklbw   mm1,    mm7                     // unpack low to word
    986 
    987         pmullw      mm2,    mm5                     // a * 2/5
    988         movq        mm6,    three_fifths             // mm6 = 3/5
    989 
    990         movq        mm4,    mm1                     // copy of low b
    991         pmullw      mm4,    mm6                     // b * 3/5
    992 
    993         punpckhbw   mm3,    mm7                     // unpack high to word
    994         movq        mm5,    mm3                     // copy of high b
    995 
    996         pmullw      mm5,    mm6                     // b * 3/5
    997         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
    998 
    999         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
   1000         paddw       mm0,    round_values             // + 128
   1001 
   1002         paddw       mm2,    round_values             // + 128
   1003         psrlw       mm0,    8
   1004 
   1005         psrlw       mm2,    8
   1006         packuswb    mm0,    mm2                     // des [1]
   1007 
   1008         movq        QWORD ptr [esi+ecx], mm0        // write des[1]
   1009         movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
   1010 
   1011 
   1012 
                // ---- row 2 = b * 4/5 + c * 1/5,  row 3 = b * 1/5 + c * 4/5 ----
   1013         // mm1, mm3 --- Src[1] (b), unpacked low/high words
   1014         // mm0 --- Src[2] (c), still packed bytes
   1015         // mm7 for unpacking
   1016 
   1017         movq        mm4,    mm1                     // b low
   1018         pmullw      mm1,    four_fifths              // b * 4/5 low
   1019 
   1020         movq        QWORD ptr [edi+ecx], mm0        // write des[4] = unmodified src[2]
   1021 
   1022         movq        mm5,    mm3                     // b high
   1023         pmullw      mm3,    four_fifths              // b * 4/5 high
   1024 
   1025         movq        mm2,    mm0                     // c
   1026         pmullw      mm4,    one_fifth                // b * 1/5
   1027 
   1028         punpcklbw   mm0,    mm7                     // c low
   1029         pmullw      mm5,    one_fifth                // b * 1/5
   1030 
   1031         movq        mm6,    mm0                     // make copy of c low
   1032         punpckhbw   mm2,    mm7                     // c high
   1033 
   1034         pmullw      mm6,    one_fifth                // c * 1/5 low
   1035         movq        mm7,    mm2                     // make copy of c high (mm7 is scratch from here on)
   1036 
   1037         pmullw      mm7,    one_fifth                // c * 1/5 high
   1038         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
   1039 
   1040         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
   1041         movq        mm6,    mm0                     // make copy of c low
   1042 
   1043         pmullw      mm6,    four_fifths              // c * 4/5 low
   1044         movq        mm7,    mm2                     // make copy of c high
   1045 
   1046         pmullw      mm7,    four_fifths              // c * 4/5 high
   1047 
   1048         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
   1049         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
   1050 
   1051         paddw       mm1,    round_values             // + 128
   1052         paddw       mm3,    round_values             // + 128
   1053 
   1054         psrlw       mm1,    8
   1055         psrlw       mm3,    8
   1056 
   1057         packuswb    mm1,    mm3                     // des[2]
   1058         movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
   1059 
   1060         paddw       mm4,    round_values             // + 128
   1061         paddw       mm5,    round_values             // + 128
   1062 
   1063         psrlw       mm4,    8
   1064         psrlw       mm5,    8
   1065 
   1066         packuswb    mm4,    mm5                     // des[3]
   1067         movq        QWORD ptr [edi], mm4            // write des[3]
   1068 
   1069         //  mm0, mm2 --- Src[2] (c), unpacked low/high words
   1070 
                // mm7 was used as a multiply scratch above. It must be zero again
                // before the next pass unpacks bytes against it, otherwise every
                // pass after the first produces corrupted pixels.
                pxor        mm7,    mm7                     // clear mm7 for unpacking
        
   1071         add         edi,    8                       // advance both row pointers by 8 columns
   1072         add         esi,    8
   1073 
   1074         sub         edx,    8                       // sets the flags tested by jg
   1075         jg          last_vs_3_5_loop
   1076     }
   1077 }
   1078 
   1079 /****************************************************************************
   1080  *
   1081  *  ROUTINE       : vertical_band_1_2_scale_mmx
   1082  *
   1083  *  INPUTS        : unsigned char *dest    : Row 0 of the band (upper source
         *                                           line).
   1084  *                  unsigned int dest_pitch : Byte distance between rows.
   1085  *                  unsigned int dest_width : Columns to scale. Eight columns
         *                                           are handled per pass and the
         *                                           loop always runs at least once.
   1086  *
   1087  *  OUTPUTS       : Row 1 of the band is overwritten.
   1088  *
   1089  *  RETURNS       : void
   1090  *
   1091  *  FUNCTION      : 1 to 2 up-scaling of a band of pixels. With a = row 0
         *                  and b = row 2:  row 1 = (a + b + 1) >> 1.
   1092  *
   1093  *  SPECIAL NOTES : The routine uses the first line of the band below
   1094  *                  the current band (row 2). The function also has a "C"
   1095  *                  only version.
         *                  NOTE(review): no emms is executed here; presumably the
         *                  caller clears the MMX state - confirm.
   1096  *
   1097  ****************************************************************************/
   1098 static
   1099 void vertical_band_1_2_scale_mmx
   1100 (
   1101     unsigned char *dest,
   1102     unsigned int dest_pitch,
   1103     unsigned int dest_width
   1104 )
   1105 {
   1106     __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};   // rounding term, one per 16-bit lane
   1107 
   1108     __asm
   1109     {
   1110 
   1111         mov         esi,    dest                    // esi = row 0 of the band
   1112         mov         ecx,    dest_pitch               // ecx = pitch (bytes per row)
   1113 
   1114         pxor        mm7,    mm7                     // clear out mm7 (zero for byte->word unpack)
   1115         mov         edx,    dest_width               // Loop counter, counts down by 8
   1116 
   1117         vs_1_2_loop:
   1118 
   1119         movq        mm0,    [esi]                   // get Src[0] (a = row 0)
   1120         movq        mm1,    [esi + ecx * 2]         // get Src[1] (b = row 2, two pitches down)
   1121 
   1122         movq        mm2,    mm0                     // make copy before unpack
   1123         movq        mm3,    mm1                     // make copy before unpack
   1124 
   1125         punpcklbw   mm0,    mm7                     // low Src[0]
   1126         movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
   1127 
   1128         punpcklbw   mm1,    mm7                     // low Src[1]
   1129         paddw       mm0,    mm1                     // low (a + b)
   1130 
   1131         punpckhbw   mm2,    mm7                     // high Src[0]
   1132         paddw       mm0,    mm6                     // low (a + b + 1)
   1133 
   1134         punpckhbw   mm3,    mm7                     // high Src[1]
   1135         paddw       mm2,    mm3                     // high (a + b )
   1136 
   1137         psraw       mm0,    1                       // low (a + b +1 )/2
   1138         paddw       mm2,    mm6                     // high (a + b + 1)
   1139 
   1140         psraw       mm2,    1                       // high (a + b + 1)/2
   1141         packuswb    mm0,    mm2                     // pack results
   1142 
   1143         movq        [esi+ecx], mm0                  // write out eight bytes to row 1
   1144         add         esi,    8                       // next 8 columns
   1145 
   1146         sub         edx,    8                       // sets the flags tested by jg
   1147         jg          vs_1_2_loop
   1148     }
   1149 
   1150 }
   1151 
   1152 /****************************************************************************
   1153  *
   1154  *  ROUTINE       : last_vertical_band_1_2_scale_mmx
   1155  *
   1156  *  INPUTS        : unsigned char *dest    : Row 0 of the band (source line).
   1157  *                  unsigned int dest_pitch : Byte distance between rows.
   1158  *                  unsigned int dest_width : Columns to copy. Eight columns
         *                                           are handled per pass and the
         *                                           loop always runs at least once.
   1159  *
   1160  *  OUTPUTS       : Row 1 of the band is overwritten.
   1161  *
   1162  *  RETURNS       : void
   1163  *
   1164  *  FUNCTION      : 1 to 2 up-scaling of the last band of pixels: row 1
         *                  becomes a copy of row 0.
   1165  *
   1166  *  SPECIAL NOTES : No line below the band is read; the only source line
   1167  *                  is duplicated instead of interpolated. The function
   1168  *                  also has a "C" only version.
   1169  *
   1170  ****************************************************************************/
   1171 static
   1172 void last_vertical_band_1_2_scale_mmx
   1173 (
   1174     unsigned char *dest,
   1175     unsigned int dest_pitch,
   1176     unsigned int dest_width
   1177 )
   1178 {
   1179     __asm
   1180     {
   1181         mov         eax,    dest                    // eax walks along row 0
   1182         mov         edx,    dest_width               // edx = columns still to copy
   1183 
   1184         mov         ecx,    dest_pitch               // ecx = offset from row 0 to row 1
   1185 
   1186         last_vs_1_2_copy:
   1187 
   1188         movq        mm0,    [eax]                   // fetch eight pixels of row 0
   1189         movq        [eax+ecx], mm0                  // duplicate them into row 1
   1190 
   1191         add         eax,    8                       // step to the next eight columns
   1192         sub         edx,    8                       // must be last: jg tests these flags
   1193 
   1194         jg         last_vs_1_2_copy
   1195     }
   1196 }
   1197 
   1198 /****************************************************************************
   1199  *
   1200  *  ROUTINE       : horizontal_line_1_2_scale_mmx
   1201  *
   1202  *  INPUTS        : const unsigned char *source : Source line.
   1203  *                  unsigned int source_width    : Number of source pixels.
   1204  *                  unsigned char *dest         : Destination line; two bytes
         *                                                are written per source pixel.
   1205  *                  unsigned int dest_width      : NOT USED.
   1206  *
   1207  *  OUTPUTS       : None.
   1208  *
   1209  *  RETURNS       : void
   1210  *
   1211  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels:
         *                    dest[2*i]     = source[i]
         *                    dest[2*i + 1] = (source[i] + source[i+1] + 1) >> 1
         *                  For the final pixel source[i+1] is replaced by
         *                  source[i] itself.
   1212  *
   1213  *  SPECIAL NOTES : Pixels are processed eight at a time. The main loop
         *                  loads source[i+1..i+8], so it is only entered while more
         *                  than eight pixels remain; the final eight are handled
         *                  by a tail that never reads past them.
         *                  NOTE(review): widths that are not a multiple of 8 make
         *                  the tail read/write beyond source_width - presumably
         *                  callers pad their lines; confirm.
   1214  *
   1215  ****************************************************************************/
   1216 static
   1217 void horizontal_line_1_2_scale_mmx
   1218 (
   1219     const unsigned char *source,
   1220     unsigned int source_width,
   1221     unsigned char *dest,
   1222     unsigned int dest_width
   1223 )
   1224 {
   1225     __declspec(align(16))unsigned short four_ones[] = { 1, 1, 1, 1};   // rounding term, one per 16-bit lane
   1226 
   1227     (void) dest_width;
   1228 
   1229     __asm
   1230     {
   1231         mov         esi,    source
   1232         mov         edi,    dest
   1233 
   1234         pxor        mm7,    mm7                     // zero, for byte->word unpacking
   1235         movq        mm6,    four_ones                // mm6 = 1, 1, 1, 1
   1236 
   1237         mov         ecx,    source_width             // ecx = pixels still to process
        
                // With eight or fewer pixels the main loop must not run at all:
                // it would read source[8] and the tail would then emit a second
                // group of 16 bytes past the end of the destination line.
                cmp         ecx,    8
                jle         hs_1_2_last_eight
   1238 
   1239         hs_1_2_loop:
   1240 
   1241         movq        mm0,    [esi]                   // p[0..7]
   1242         movq        mm1,    [esi+1]                 // p[1..8], the right-hand neighbours
   1243 
   1244         movq        mm2,    mm0
   1245         movq        mm3,    mm1
   1246 
   1247         movq        mm4,    mm0                     // keep the original pixels for interleaving
   1248         punpcklbw   mm0,    mm7
   1249 
   1250         punpcklbw   mm1,    mm7
   1251         paddw       mm0,    mm1                     // low  (p[i] + p[i+1])
   1252 
   1253         paddw       mm0,    mm6                     // + 1
   1254         punpckhbw   mm2,    mm7
   1255 
   1256         punpckhbw   mm3,    mm7
   1257         paddw       mm2,    mm3                     // high (p[i] + p[i+1])
   1258 
   1259         paddw       mm2,    mm6                     // + 1
   1260         psraw       mm0,    1
   1261 
   1262         psraw       mm2,    1
   1263         packuswb    mm0,    mm2                     // mm0 = eight interpolated pixels
   1264 
   1265         movq        mm2,    mm4
   1266         punpcklbw   mm2,    mm0                     // p0 i0 p1 i1 p2 i2 p3 i3
   1267 
   1268         movq        [edi],  mm2
   1269         punpckhbw   mm4,    mm0                     // p4 i4 p5 i5 p6 i6 p7 i7
   1270 
   1271         movq        [edi+8], mm4
   1272         add         esi,    8
   1273 
   1274         add         edi,    16
   1275         sub         ecx,    8
   1276 
   1277         cmp         ecx,    8                       // leave the final eight pixels for the tail
   1278         jg          hs_1_2_loop
   1279 
   1280 // last eight pixels: the right-hand neighbour of the final pixel is the pixel itself
        
                hs_1_2_last_eight:
   1281 
   1282         movq        mm0,    [esi]                   // p[0..7]
   1283         movq        mm1,    mm0
   1284 
   1285         movq        mm2,    mm0
   1286         movq        mm3,    mm1
   1287 
   1288         psrlq       mm1,    8                       // p[1..7], 0
   1289         psrlq       mm3,    56
   1290 
   1291         psllq       mm3,    56                      // 0 x 7, p[7]
   1292         por         mm1,    mm3                     // p[1..7], p[7]
   1293 
   1294         movq        mm3,    mm1
   1295         movq        mm4,    mm0                     // keep the original pixels for interleaving
   1296 
   1297         punpcklbw   mm0,    mm7
   1298         punpcklbw   mm1,    mm7
   1299 
   1300         paddw       mm0,    mm1
   1301         paddw       mm0,    mm6
   1302 
   1303         punpckhbw   mm2,    mm7
   1304         punpckhbw   mm3,    mm7
   1305 
   1306         paddw       mm2,    mm3
   1307         paddw       mm2,    mm6
   1308 
   1309         psraw       mm0,    1
   1310         psraw       mm2,    1
   1311 
   1312         packuswb    mm0,    mm2                     // mm0 = eight interpolated pixels
   1313         movq        mm2,    mm4
   1314 
   1315         punpcklbw   mm2,    mm0
   1316         movq        [edi],  mm2
   1317 
   1318         punpckhbw   mm4,    mm0
   1319         movq        [edi+8], mm4
   1320     }
   1321 }
   1322 
   1323 
   1324 
   1325 
   1326 
   1327 
   1328 /****************************************************************************
   1329  *
   1330  *  ROUTINE       : horizontal_line_5_4_scale_mmx
   1331  *
   1332  *  INPUTS        : const unsigned char *source : Pointer to source data.
   1333  *                  unsigned int source_width    : Number of source pixels;
         *                                                consumed five per pass.
   1334  *                  unsigned char *dest         : Pointer to destination data.
   1335  *                  unsigned int dest_width      : Width of destination (NOT USED).
   1336  *
   1337  *  OUTPUTS       : None.
   1338  *
   1339  *  RETURNS       : void
   1340  *
   1341  *  FUNCTION      : Copies horizontal line of pixels from source to
   1342  *                  destination scaling down from 5 to 4: every group of
         *                  five source pixels a..e produces four output pixels
         *                    a, (b*192 + c*64 + 128) >> 8,
         *                    (c*128 + d*128 + 128) >> 8, (d*64 + e*192 + 128) >> 8
   1343  *
   1344  *  SPECIAL NOTES : Each pass loads eight source bytes although only five
         *                  are used, so the last pass reads three bytes beyond
         *                  source_width. The loop always runs at least once.
   1345  *
   1346  ****************************************************************************/
   1347 static
   1348 void horizontal_line_5_4_scale_mmx
   1349 (
   1350     const unsigned char *source,
   1351     unsigned int source_width,
   1352     unsigned char *dest,
   1353     unsigned int dest_width
   1354 )
   1355 {
   1356 
            // Per-lane weights (n/256). Lane k of the output is
            //   src[k] * const54_1[k] + src[k+1] * const54_2[k]
   1357     __declspec(align(16)) const unsigned short const54_2[] = {  0,  64, 128, 192 };
   1358     __declspec(align(16)) const unsigned short const54_1[] = {256, 192, 128,  64 };
   1359     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
   1360     /*
   1361     unsigned i;
   1362     unsigned int a, b, c, d, e;
   1363     unsigned char *des = dest;
   1364     const unsigned char *src = source;
   1365 
   1366     (void) dest_width;
   1367 
   1368     for ( i=0; i<source_width; i+=5 )
   1369     {
   1370         a = src[0];
   1371         b = src[1];
   1372         c = src[2];
   1373         d = src[3];
   1374         e = src[4];
   1375 
   1376         des[0] = a;
   1377         des[1] = ((b*192 + c* 64 + 128)>>8);
   1378         des[2] = ((c*128 + d*128 + 128)>>8);
   1379         des[3] = ((d* 64 + e*192 + 128)>>8);
   1380 
   1381         src += 5;
   1382         des += 4;
   1383     }
   1384     */
        
            (void) dest_width;
        
   1385     __asm
   1386     {
   1387 
   1388         mov         esi,        source              ;
   1389         mov         edi,        dest                ;
   1390 
   1391         mov         ecx,        source_width         ;
   1392         movq        mm5,        const54_1           ; weights for src[k]
   1393 
   1394         pxor        mm7,        mm7                 ; zero, for byte->word unpacking
   1395         movq        mm6,        const54_2           ; weights for src[k+1]
   1396 
   1397         movq        mm4,        round_values         ;
   1398         lea         edx,        [esi+ecx]           ; edx = one past the last source pixel
   1399         horizontal_line_5_4_loop:
   1400 
   1401         movq        mm0,        QWORD PTR  [esi]    ; 00 01 02 03 04 05 06 07
   1403         movq        mm1,        mm0                 ; 00 01 02 03 04 05 06 07
   1405 
   1406         psrlq       mm0,        8                   ; 01 02 03 04 05 06 07 xx
   1408         punpcklbw   mm1,        mm7                 ; xx 00 xx 01 xx 02 xx 03
   1410 
   1411         punpcklbw   mm0,        mm7                 ; xx 01 xx 02 xx 03 xx 04
   1413         pmullw      mm1,        mm5                 ; src[k]   * const54_1[k]
   1414 
   1415         pmullw      mm0,        mm6                 ; src[k+1] * const54_2[k]
   1416         add         esi,        5                   ; five source pixels consumed
   1417 
   1418         add         edi,        4                   ; four destination pixels produced
   1419         paddw       mm1,        mm0
   1420 
   1421         paddw       mm1,        mm4                 ; + 128
   1422         psrlw       mm1,        8
   1423 
   1424         cmp         esi,        edx                 ; flags survive the MMX ops below
   1425         packuswb    mm1,        mm7
   1426 
   1427         movd        DWORD PTR [edi-4], mm1
   1428 
   1429         jb          horizontal_line_5_4_loop        ; unsigned compare: esi and edx are addresses
   1430 
   1431     }
   1432 
   1433 }
   1434 
   1435 static
   1436 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1437 {
            /*
             * Scales a band of five source rows down to four destination rows.
             * With a..e = source rows 0..4 (src_pitch bytes apart), the
             * destination rows (dest_pitch bytes apart) become:
             *   row 0 = a
             *   row 1 = (b * 192 + c *  64 + 128) >> 8
             *   row 2 = (c * 128 + d * 128 + 128) >> 8
             *   row 3 = (d *  64 + e * 192 + 128) >> 8
             * Four columns are handled per pass, dest_width is the column count
             * and the loop always runs at least once.
             * NOTE(review): no emms is executed here; presumably the caller
             * clears the MMX state - confirm.
             */
   1438 
            // Blend weights as n/256 fixed-point fractions, one copy per 16-bit lane.
   1439     __declspec(align(16)) const unsigned short one_fourths[]   = {  64,  64,  64, 64  };
   1440     __declspec(align(16)) const unsigned short two_fourths[]   = { 128, 128, 128, 128 };
   1441     __declspec(align(16)) const unsigned short three_fourths[] = { 192, 192, 192, 192 };
   1442 
   1443     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
   1444     __asm
   1445     {
   1446         push        ebx                             // ebx is used as the loop counter; restored at the end
   1447 
   1448         mov         esi,    source                    // esi = source row 0
   1449         mov         ecx,    src_pitch               // ecx = source pitch
   1450 
   1451         mov         edi,    dest                    // edi = destination row 0
   1452         pxor        mm7,    mm7                     // clear out mm7 (zero for unpack/pack)
   1453 
   1454         mov         edx,    dest_pitch               // edx = destination pitch
   1455         mov         ebx,    dest_width               // Loop counter, counts down by 4
   1456 
   1457         vs_5_4_loop:
   1458 
   1459         movd        mm0,    DWORD ptr [esi]         // src[0]; stored unchanged as des[0]
   1460         movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
   1461 
   1462         movd        mm2,    DWORD ptr [esi+ecx*2]   // src[2];
   1463         lea         eax,    [esi+ecx*2]             // eax = source row 2
   1464 
   1465         punpcklbw   mm1,    mm7                     // b as words
   1466         punpcklbw   mm2,    mm7                     // c as words
   1467 
   1468         movq        mm3,    mm2                     // second copy of c
   1469         pmullw      mm1,    three_fourths            // b * 3/4
   1470 
   1471         pmullw      mm2,    one_fourths              // c * 1/4
   1472         movd        mm4,    [eax+ecx]               // src[3];
   1473 
   1474         pmullw      mm3,    two_fourths              // c * 2/4
   1475         punpcklbw   mm4,    mm7                     // d as words
   1476 
   1477         movq        mm5,    mm4                     // second copy of d
   1478         pmullw      mm4,    two_fourths              // d * 2/4
   1479 
   1480         paddw       mm1,    mm2                     // b * 3/4 + c * 1/4
   1481         movd        mm6,    [eax+ecx*2]             // src[4];
   1482 
   1483         pmullw      mm5,    one_fourths              // d * 1/4
   1484         paddw       mm1,    round_values;            // + 128
   1485 
   1486         paddw       mm3,    mm4                     // c * 2/4 + d * 2/4
   1487         psrlw       mm1,    8
   1488 
   1489         punpcklbw   mm6,    mm7                     // e as words
   1490         paddw       mm3,    round_values             // + 128
   1491 
   1492         pmullw      mm6,    three_fourths            // e * 3/4
   1493         psrlw       mm3,    8
   1494 
   1495         packuswb    mm1,    mm7                     // des[1]
   1496         packuswb    mm3,    mm7                     // des[2]
   1497 
   1498         movd        DWORD PTR [edi], mm0            // write des[0]
   1499         movd        DWORD PTR [edi+edx], mm1        // write des[1]
   1500 
   1501 
   1502         paddw       mm5,    mm6                     // d * 1/4 + e * 3/4
   1503         movd        DWORD PTR [edi+edx*2], mm3      // write des[2]
   1504 
   1505         lea         eax,    [edi+edx*2]             // eax = destination row 2
   1506         paddw       mm5,    round_values             // + 128
   1507 
   1508         psrlw       mm5,    8
   1509         add         edi,    4                       // next 4 destination columns (eax already captured)
   1510 
   1511         packuswb    mm5,    mm7                     // des[3]
   1512         movd        DWORD PTR [eax+edx], mm5        // write des[3]
   1513 
   1514         add         esi,    4                       // next 4 source columns
   1515         sub         ebx,    4                       // sets the flags tested by jg
   1516 
   1517         jg         vs_5_4_loop
   1518 
   1519         pop         ebx
   1520     }
   1521 }
   1522 
   1523 
   1524 
   1525 static
   1526 void horizontal_line_5_3_scale_mmx
   1527 (
   1528     const unsigned char *source,
   1529     unsigned int source_width,
   1530     unsigned char *dest,
   1531     unsigned int dest_width
   1532 )
   1533 {
   1534     __declspec(align(16)) const unsigned short const53_1[] = {  0,  85, 171, 0 };
   1535     __declspec(align(16)) const unsigned short const53_2[] = {256, 171,  85, 0 };
   1536     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
   1537     __asm
   1538     {
   1539 
   1540         mov         esi,        source              ;
   1541         mov         edi,        dest                ;
   1542 
   1543         mov         ecx,        source_width         ;
   1544         movq        mm5,        const53_1           ;
   1545 
   1546         pxor        mm7,        mm7                 ;
   1547         movq        mm6,        const53_2           ;
   1548 
   1549         movq        mm4,        round_values         ;
   1550         lea         edx,        [esi+ecx-5]         ;
   1551         horizontal_line_5_3_loop:
   1552 
   1553         movq        mm0,        QWORD PTR  [esi]    ;
   1554         00 01 02 03 04 05 06 07
   1555         movq        mm1,        mm0                 ;
   1556         00 01 02 03 04 05 06 07
   1557 
   1558         psllw       mm0,        8                   ;
   1559         xx 00 xx 02 xx 04 xx 06
   1560         psrlw       mm1,        8                   ;
   1561         01 xx 03 xx 05 xx 07 xx
   1562 
   1563         psrlw       mm0,        8                   ;
   1564         00 xx 02 xx 04 xx 06 xx
   1565         psllq       mm1,        16                  ;
   1566         xx xx 01 xx 03 xx 05 xx
   1567 
   1568         pmullw      mm0,        mm6
   1569 
   1570         pmullw      mm1,        mm5
   1571         add         esi,        5
   1572 
   1573         add         edi,        3
   1574         paddw       mm1,        mm0
   1575 
   1576         paddw       mm1,        mm4
   1577         psrlw       mm1,        8
   1578 
   1579         cmp         esi,        edx
   1580         packuswb    mm1,        mm7
   1581 
   1582         movd        DWORD PTR [edi-3], mm1
   1583         jl          horizontal_line_5_3_loop
   1584 
   1585 //exit condition
   1586         movq        mm0,        QWORD PTR  [esi]    ;
   1587         00 01 02 03 04 05 06 07
   1588         movq        mm1,        mm0                 ;
   1589         00 01 02 03 04 05 06 07
   1590 
   1591         psllw       mm0,        8                   ;
   1592         xx 00 xx 02 xx 04 xx 06
   1593         psrlw       mm1,        8                   ;
   1594         01 xx 03 xx 05 xx 07 xx
   1595 
   1596         psrlw       mm0,        8                   ;
   1597         00 xx 02 xx 04 xx 06 xx
   1598         psllq       mm1,        16                  ;
   1599         xx xx 01 xx 03 xx 05 xx
   1600 
   1601         pmullw      mm0,        mm6
   1602 
   1603         pmullw      mm1,        mm5
   1604         paddw       mm1,        mm0
   1605 
   1606         paddw       mm1,        mm4
   1607         psrlw       mm1,        8
   1608 
   1609         packuswb    mm1,        mm7
   1610         movd        eax,        mm1
   1611 
   1612         mov         edx,        eax
   1613         shr         edx,        16
   1614 
   1615         mov         WORD PTR[edi],   ax
   1616         mov         BYTE PTR[edi+2], dl
   1617 
   1618     }
   1619 
   1620 }
   1621 
   1622 
   1623 static
   1624 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1625 {
   1626     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
   1627     __declspec(align(16)) const unsigned short one_thirds[] = {  85,  85,  85,  85 };
   1628     __declspec(align(16)) const unsigned short two_thirds[] = { 171, 171, 171, 171 };
   1629 
   1630     __asm
   1631     {
   1632         push        ebx
   1633 
   1634         mov         esi,    source                    // Get the source and destination pointer
   1635         mov         ecx,    src_pitch               // Get the pitch size
   1636 
   1637         mov         edi,    dest                    // tow lines below
   1638         pxor        mm7,    mm7                     // clear out mm7
   1639 
   1640         mov         edx,    dest_pitch               // Loop counter
   1641         movq        mm5,    one_thirds
   1642 
   1643         movq        mm6,    two_thirds
   1644         mov         ebx,    dest_width;
   1645 
   1646         vs_5_3_loop:
   1647 
   1648         movd        mm0,    DWORD ptr [esi]         // src[0];
   1649         movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
   1650 
   1651         movd        mm2,    DWORD ptr [esi+ecx*2]
   1652         lea         eax,    [esi+ecx*2]             //
   1653 
   1654         punpcklbw   mm1,    mm7
   1655         punpcklbw   mm2,    mm7
   1656 
   1657         pmullw      mm1,    mm5
   1658         pmullw      mm2,    mm6
   1659 
   1660         movd        mm3,    DWORD ptr [eax+ecx]
   1661         movd        mm4,    DWORD ptr [eax+ecx*2]
   1662 
   1663         punpcklbw   mm3,    mm7
   1664         punpcklbw   mm4,    mm7
   1665 
   1666         pmullw      mm3,    mm6
   1667         pmullw      mm4,    mm5
   1668 
   1669 
   1670         movd        DWORD PTR [edi], mm0
   1671         paddw       mm1,    mm2
   1672 
   1673         paddw       mm1,    round_values
   1674         psrlw       mm1,    8
   1675 
   1676         packuswb    mm1,    mm7
   1677         paddw       mm3,    mm4
   1678 
   1679         paddw       mm3,    round_values
   1680         movd        DWORD PTR [edi+edx], mm1
   1681 
   1682         psrlw       mm3,    8
   1683         packuswb    mm3,    mm7
   1684 
   1685         movd        DWORD PTR [edi+edx*2], mm3
   1686 
   1687 
   1688         add         edi,    4
   1689         add         esi,    4
   1690 
   1691         sub         ebx,    4
   1692         jg          vs_5_3_loop
   1693 
   1694         pop         ebx
   1695     }
   1696 }
   1697 
   1698 
   1699 
   1700 
   1701 /****************************************************************************
   1702  *
   1703  *  ROUTINE       : horizontal_line_2_1_scale
   1704  *
   1705  *  INPUTS        : const unsigned char *source :
   1706  *                  unsigned int source_width    :
   1707  *                  unsigned char *dest         :
   1708  *                  unsigned int dest_width      :
   1709  *
   1710  *  OUTPUTS       : None.
   1711  *
   1712  *  RETURNS       : void
   1713  *
   1714  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
   1715  *
   1716  *  SPECIAL NOTES : None.
   1717  *
   1718  ****************************************************************************/
   1719 static
   1720 void horizontal_line_2_1_scale_mmx
   1721 (
   1722     const unsigned char *source,
   1723     unsigned int source_width,
   1724     unsigned char *dest,
   1725     unsigned int dest_width
   1726 )
   1727 {
   1728     (void) dest_width;
   1729 
   1730     __asm
   1731     {
   1732         mov         esi,    source
   1733         mov         edi,    dest
   1734 
   1735         pxor        mm7,    mm7
   1736         mov         ecx,    dest_width
   1737 
   1738         xor         edx,    edx
   1739         hs_2_1_loop:
   1740 
   1741         movq        mm0,    [esi+edx*2]
   1742         psllw       mm0,    8
   1743 
   1744         psrlw       mm0,    8
   1745         packuswb    mm0,    mm7
   1746 
   1747         movd        DWORD Ptr [edi+edx], mm0;
   1748         add         edx,    4
   1749 
   1750         cmp         edx,    ecx
   1751         jl          hs_2_1_loop
   1752 
   1753     }
   1754 }
   1755 
   1756 
   1757 
   1758 static
   1759 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1760 {
   1761     vpx_memcpy(dest, source, dest_width);
   1762 }
   1763 
   1764 
   1765 
   1766 static
   1767 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
   1768 {
   1769 
   1770     __declspec(align(16)) const unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
   1771     __declspec(align(16)) const unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
   1772     __declspec(align(16)) unsigned short round_values[] = { 128, 128, 128, 128 };
   1773     __asm
   1774     {
   1775         mov         esi,        source
   1776         mov         edi,        dest
   1777 
   1778         mov         eax,        src_pitch
   1779         mov         edx,        dest_width
   1780 
   1781         pxor        mm7,        mm7
   1782         sub         esi,        eax             //back one line
   1783 
   1784 
   1785         lea         ecx,        [esi+edx];
   1786         movq        mm6,        round_values;
   1787 
   1788         movq        mm5,        three_sixteenths;
   1789         movq        mm4,        ten_sixteenths;
   1790 
   1791         vs_2_1_i_loop:
   1792         movd        mm0,        [esi]           //
   1793         movd        mm1,        [esi+eax]       //
   1794 
   1795         movd        mm2,        [esi+eax*2]     //
   1796         punpcklbw   mm0,        mm7
   1797 
   1798         pmullw      mm0,        mm5
   1799         punpcklbw   mm1,        mm7
   1800 
   1801         pmullw      mm1,        mm4
   1802         punpcklbw   mm2,        mm7
   1803 
   1804         pmullw      mm2,        mm5
   1805         paddw       mm0,        round_values
   1806 
   1807         paddw       mm1,        mm2
   1808         paddw       mm0,        mm1
   1809 
   1810         psrlw       mm0,        8
   1811         packuswb    mm0,        mm7
   1812 
   1813         movd        DWORD PTR [edi],        mm0
   1814         add         esi,        4
   1815 
   1816         add         edi,        4;
   1817         cmp         esi,        ecx
   1818         jl          vs_2_1_i_loop
   1819 
   1820     }
   1821 }
   1822 
   1823 void
   1824 register_mmxscalers(void)
   1825 {
   1826     vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
   1827     vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
   1828     vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
   1829     vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
   1830     vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
   1831     vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
   1832     vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
   1833     vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
   1834     vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
   1835 
   1836     vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
   1837     vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
   1838     vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
   1839     vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
   1840     vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
   1841     vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
   1842 
   1843 
   1844 
   1845     vp8_vertical_band_5_4_scale          = vertical_band_5_4_scale_mmx;
   1846     vp8_vertical_band_5_3_scale          = vertical_band_5_3_scale_mmx;
   1847     vp8_vertical_band_2_1_scale          = vertical_band_2_1_scale_mmx;
   1848     vp8_vertical_band_2_1_scale_i        = vertical_band_2_1_scale_i_mmx;
   1849     vp8_horizontal_line_2_1_scale        = horizontal_line_2_1_scale_mmx;
   1850     vp8_horizontal_line_5_3_scale        = horizontal_line_5_3_scale_mmx;
   1851     vp8_horizontal_line_5_4_scale        = horizontal_line_5_4_scale_mmx;
   1852 
   1853 }
   1854