Home | History | Annotate | Download | only in x86_64
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 /****************************************************************************
     13 *
     14 *   Module Title :     scaleopt.cpp
     15 *
     16 *   Description  :     Optimized scaling functions
     17 *
     18 ****************************************************************************/
     19 #include "pragmas.h"
     20 
     21 
     22 
     23 /****************************************************************************
     24 *  Module Statics
     25 ****************************************************************************/
     26 __declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };          // ~1/5 scaled by 256 (results are shifted right by 8)
     27 __declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };      // ~2/5 scaled by 256
     28 __declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };    // ~3/5 scaled by 256
     29 __declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };     // ~4/5 scaled by 256
     30 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };    // half of 256, added before the shift right by 8 to round
     31 __declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};                // no use visible in this part of the file
     32 __declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };        // horizontal 4 to 5: weights applied to the right-hand pixel of each pair
     33 __declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };        // horizontal 4 to 5: weights applied to the left-hand pixel of each pair
     34 __declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};     // keeps byte 6 only; used by the 4 to 5 tail to repeat the last pixel
     35 __declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };       // horizontal 3 to 5: weights applied to the right-hand pixel of each pair
     36 __declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };       // horizontal 3 to 5: weights applied to the left-hand pixel of each pair
     37 
     38 
     39 
     40 #include "vpx_scale/vpxscale.h"
     41 #include "vpx_mem/vpx_mem.h"
     42 
     43 /****************************************************************************
     44 *
     45 *  ROUTINE       : horizontal_line_3_5_scale_mmx
     46 *
     47 *  INPUTS        : const unsigned char *source : First pixel of the line to be scaled.
     48 *                  unsigned int source_width    : Number of source pixels; sets the loop end pointer.
     49 *                  unsigned char *dest         : Where the scaled pixels are written.
     50 *                  unsigned int dest_width      : Unused.
     51 *
     52 *  OUTPUTS       : None.
     53 *
     54 *  RETURNS       : void
     55 *
     56 *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
     57 *                  Each group of 3 source pixels yields 5 output pixels: a copy of the first plus 4 rounded weighted sums of neighbouring pixels.
     58 *  SPECIAL NOTES : The loop is tested at the bottom and is followed by a tail group; each group is loaded as 4 bytes.
     59 *                  NOTE(review): appears to assume source_width >= 6 - confirm with callers.
     60 ****************************************************************************/
     61 static
     62 void horizontal_line_3_5_scale_mmx
     63 (
     64     const unsigned char *source,
     65     unsigned int source_width,
     66     unsigned char *dest,
     67     unsigned int dest_width
     68 )
     69 {
     70     (void) dest_width;
     71 
     72     __asm
     73     {
     74 
     75         push        rbx                     // rbx is used as scratch below and restored before leaving
     76 
     77         mov         rsi,    source
     78         mov         rdi,    dest
     79 
     80         mov         ecx,    source_width
     81         lea         rdx,    [rsi+rcx-3];    // rdx = source + source_width - 3, the loop end pointer
     82 
     83         movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
     84         movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
     85 
     86         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
     87         pxor        mm7,    mm7             // clear mm7
     88 
     89         horiz_line_3_5_loop:
     90 
     91         mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
     92         mov         ebx,    eax
     93 
     94         and         ebx,    0xffff00        // ebx = xx 01 02 xx
     95         mov         ecx,    eax             // ecx = 00 01 02 03
     96 
     97         and         eax,    0xffff0000      // eax = xx xx 02 03
     98         xor         ecx,    eax             // ecx = 00 01 xx xx
     99 
    100         shr         ebx,    8               // ebx = 01 02 xx xx
    101         or          eax,    ebx             // eax = 01 02 02 03
    102 
    103         shl         ebx,    16              // ebx = xx xx 01 02
    104         movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
    105 
    106         or          ebx,    ecx             // ebx = 00 01 01 02
    107         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
    108 
    109         movd        mm0,    ebx             // mm0 = 00 01 01 02
    110         pmullw      mm1,    mm6             // right-hand pixels * const35_2
    111 
    112         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    113         pmullw      mm0,    mm5             // left-hand pixels * const35_1
    114 
    115         mov         [rdi],  ebx             // write output 00; the 3 bytes after it are overwritten below
    116         add         rsi,    3
    117 
    118         add         rdi,    5
    119         paddw       mm0,    mm1             // sum of the two weighted pixels
    120 
    121         paddw       mm0,    mm4             // add round values
    122         psrlw       mm0,    8
    123 
    124         cmp         rsi,    rdx
    125         packuswb    mm0,    mm7
    126 
    127         movd        DWORD Ptr [rdi-4], mm0  // write outputs 01 02 03 04 of the group just processed
    128         jl          horiz_line_3_5_loop
    129 
    130 //Exit: tail group; pixel 02 is used in place of 03
    131         mov         eax,    DWORD PTR [rsi] // eax = 00 01 02 03
    132         mov         ebx,    eax
    133 
    134         and         ebx,    0xffff00        // ebx = xx 01 02 xx
    135         mov         ecx,    eax             // ecx = 00 01 02 03
    136 
    137         and         eax,    0xffff0000      // eax = xx xx 02 03
    138         xor         ecx,    eax             // ecx = 00 01 xx xx
    139 
    140         shr         ebx,    8               // ebx = 01 02 xx xx
    141         or          eax,    ebx             // eax = 01 02 02 03
    142 
    143         shl         eax,    8               // eax = xx 01 02 02
    144         and         eax,    0xffff0000      // eax = xx xx 02 02
    145 
    146         or          eax,    ebx             // eax = 01 02 02 02
    147 
    148         shl         ebx,    16              // ebx = xx xx 01 02
    149         movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
    150 
    151         or          ebx,    ecx             // ebx = 00 01 01 02
    152         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
    153 
    154         movd        mm0,    ebx             // mm0 = 00 01 01 02
    155         pmullw      mm1,    mm6             // right-hand pixels * const35_2
    156 
    157         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
    158         pmullw      mm0,    mm5             // left-hand pixels * const35_1
    159 
    160         mov         [rdi],  ebx             // write output 00; the 3 bytes after it are overwritten below
    161         paddw       mm0,    mm1             // sum of the two weighted pixels
    162 
    163         paddw       mm0,    mm4             // add round values
    164         psrlw       mm0,    8
    165 
    166         packuswb    mm0,    mm7
    167         movd        DWORD Ptr [rdi+1], mm0  // write outputs 01 02 03 04
    168 
    169         pop rbx
    170 
    171     }
    172 
    173 }
    174 
    175 
    176 /****************************************************************************
    177 *
    178 *  ROUTINE       : horizontal_line_4_5_scale_mmx
    179 *
    180 *  INPUTS        : const unsigned char *source : First pixel of the line to be scaled.
    181 *                  unsigned int source_width    : Number of source pixels; sets the loop end pointer.
    182 *                  unsigned char *dest         : Where the scaled pixels are written.
    183 *                  unsigned int dest_width      : Unused.
    184 *
    185 *  OUTPUTS       : None.
    186 *
    187 *  RETURNS       : void
    188 *
    189 *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
    190 *                  Each pass turns 8 source pixels into 10 output pixels (two 4 to 5 groups).
    191 *  SPECIAL NOTES : The loop reads 9 source bytes per pass; the tail reads 8 and repeats the last one.
    192 *                  NOTE(review): the bottom-tested loop plus tail appears to assume source_width >= 16 - confirm with callers.
    193 ****************************************************************************/
    194 static
    195 void horizontal_line_4_5_scale_mmx
    196 (
    197     const unsigned char *source,
    198     unsigned int source_width,
    199     unsigned char *dest,
    200     unsigned int dest_width
    201 )
    202 {
    203     (void)dest_width;
    204 
    205     __asm
    206     {
    207 
    208         mov         rsi,    source
    209         mov         rdi,    dest
    210 
    211         mov         ecx,    source_width
    212         lea         rdx,    [rsi+rcx-8];    // rdx = source + source_width - 8, the loop end pointer
    213 
    214         movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
    215         movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
    216 
    217         movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
    218         pxor        mm7,    mm7             // clear mm7
    219 
    220         horiz_line_4_5_loop:
    221 
    222         movq        mm0,    QWORD PTR [rsi]           // mm0 = 00 01 02 03 04 05 06 07
    223         movq        mm1,    QWORD PTR [rsi+1];        // mm1 = 01 02 03 04 05 06 07 08
    224 
    225         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    226         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
    227 
    228         movd        DWORD PTR [rdi],  mm0             // write output 00; the 3 bytes after it are overwritten below
    229         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
    230 
    231         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    232         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
    233 
    234         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    235         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
    236 
    237         movd        DWORD PTR [rdi+5], mm2            // write output 05 (= source 04); the 3 bytes after it are overwritten below
    238         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
    239 
    240         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
    241         pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
    242 
    243         paddw       mm0,    mm1             // sum of the two weighted pixels
    244         paddw       mm0,    mm4             // add round values
    245 
    246         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
    247         packuswb    mm0,    mm7
    248 
    249         movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
    250         add         rdi,    10
    251 
    252         add         rsi,    8
    253         paddw       mm2,    mm3             // sum of the two weighted pixels
    254 
    255         paddw       mm2,    mm4             // add round values
    256         cmp         rsi,    rdx
    257 
    258         psrlw       mm2,    8
    259         packuswb    mm2,    mm7
    260 
    261         movd        DWORD PTR [rdi-4], mm2 // write output 06 07 08 09
    262         jl         horiz_line_4_5_loop
    263 
    264 //Exit: tail group of 8 pixels; pixel 07 is used in place of 08
    265         movq        mm0,    [rsi]           // mm0 = 00 01 02 03 04 05 06 07
    266         movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
    267 
    268         movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
    269         psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
    270 
    271         movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
    272         pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
    273 
    274         psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
    275         por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
    276 
    277         movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 07
    278 
    279         movd        DWORD PTR [rdi],  mm0   // write output 00; the 3 bytes after it are overwritten below
    280         punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
    281 
    282         punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
    283         pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
    284 
    285         pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
    286         punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
    287 
    288         movd        DWORD PTR [rdi+5], mm2  // write output 05 (= source 04); the 3 bytes after it are overwritten below
    289         pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
    290 
    291         punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 07 xx
    292         pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
    293 
    294         paddw       mm0,    mm1             // sum of the two weighted pixels
    295         paddw       mm0,    mm4             // add round values
    296 
    297         psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
    298         packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
    299 
    300         movd        DWORD PTR [rdi+1], mm0  // write output 01 02 03 04
    301         paddw       mm2,    mm3             // sum of the two weighted pixels
    302 
    303         paddw       mm2,    mm4             // add round values
    304         psrlw       mm2,    8
    305 
    306         packuswb    mm2,    mm7
    307         movd        DWORD PTR [rdi+6], mm2  // write output 06 07 08 09
    308 
    309 
    310     }
    311 }
    312 
    313 /****************************************************************************
    314 *
    315 *  ROUTINE       : vertical_band_4_5_scale_mmx
    316 *
    317 *  INPUTS        : unsigned char *dest    : Top row of the band; rows are rewritten in place.
    318 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
    319 *                  unsigned int dest_width : Number of columns; processed 8 at a time.
    320 *
    321 *  OUTPUTS       : None.
    322 *
    323 *  RETURNS       : void
    324 *
    325 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
    326 *                  Rows 1..4 are rewritten as rounded blends of rows 0..3 and row 5; row 0 is left as it is.
    327 *  SPECIAL NOTES : The routine uses the first line of the band below
    328 *                  the current band. The function also has a "C" only
    329 *                  version.
    330 *                  The loop steps 8 columns per pass, so a width that is not a multiple of 8 is rounded up.
    331 ****************************************************************************/
    332 static
    333 void vertical_band_4_5_scale_mmx
    334 (
    335     unsigned char *dest,
    336     unsigned int dest_pitch,
    337     unsigned int dest_width
    338 )
    339 {
    340     __asm
    341     {
    342 
    343         mov         rsi,    dest                    // Get the source and destination pointer
    344         mov         ecx,    dest_pitch               // Get the pitch size
    345 
    346         lea         rdi,    [rsi+rcx*2]             // two lines below
    347         add         rdi,    rcx                     // three lines below
    348 
    349         pxor        mm7,    mm7                     // clear out mm7
    350         mov         edx,    dest_width               // Loop counter
    351 
    352         vs_4_5_loop:
    353 
    354         movq        mm0,    QWORD ptr [rsi]         // src[0];
    355         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
    356 
    357         movq        mm2,    mm0                     // Make a copy
    358         punpcklbw   mm0,    mm7                     // unpack low to word
    359 
    360         movq        mm5,    one_fifth
    361         punpckhbw   mm2,    mm7                     // unpack high to word
    362 
    363         pmullw      mm0,    mm5                     // a * 1/5
    364 
    365         movq        mm3,    mm1                     // make a copy
    366         punpcklbw   mm1,    mm7                     // unpack low to word
    367 
    368         pmullw      mm2,    mm5                     // a * 1/5
    369         movq        mm6,    four_fifths               // mm6 = 4/5
    370 
    371         movq        mm4,    mm1                     // copy of low b
    372         pmullw      mm4,    mm6                     // b * 4/5
    373 
    374         punpckhbw   mm3,    mm7                     // unpack high to word
    375         movq        mm5,    mm3                     // copy of high b
    376 
    377         pmullw      mm5,    mm6                     // b * 4/5
    378         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
    379 
    380         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    381         paddw       mm0,    round_values             // + 128
    382 
    383         paddw       mm2,    round_values             // + 128
    384         psrlw       mm0,    8
    385 
    386         psrlw       mm2,    8
    387         packuswb    mm0,    mm2                     // des [1]
    388 
    389         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
    390         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
    391 
    392         // mm1, mm3 --- Src[1], unpacked before des[1] was written
    393         // mm0 --- Src[2]
    394         // mm7 for unpacking
    395 
    396         movq        mm5,    two_fifths
    397         movq        mm2,    mm0                     // make a copy
    398 
    399         pmullw      mm1,    mm5                     // b * 2/5
    400         movq        mm6,    three_fifths
    401 
    402 
    403         punpcklbw   mm0,    mm7                     // unpack low to word
    404         pmullw      mm3,    mm5                     // b * 2/5
    405 
    406         movq        mm4,    mm0                     // make copy of c
    407         punpckhbw   mm2,    mm7                     // unpack high to word
    408 
    409         pmullw      mm4,    mm6                     // c * 3/5
    410         movq        mm5,    mm2
    411 
    412         pmullw      mm5,    mm6                     // c * 3/5
    413         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
    414 
    415         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    416         paddw       mm1,    round_values             // + 128
    417 
    418         paddw       mm3,    round_values             // + 128
    419         psrlw       mm1,    8
    420 
    421         psrlw       mm3,    8
    422         packuswb    mm1,    mm3                     // des[2]
    423 
    424         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
    425         movq        mm1,    [rdi]                   // mm1=Src[3];
    426 
    427         // mm0, mm2 --- Src[2], unpacked before des[2] was written
    428         // mm1 --- Src[3]
    429         // mm6 --- 3/5
    430         // mm7 for unpacking
    431 
    432         pmullw      mm0,    mm6                     // c * 3/5
    433         movq        mm5,    two_fifths               // mm5 = 2/5
    434 
    435         movq        mm3,    mm1                     // make a copy
    436         pmullw      mm2,    mm6                     // c * 3/5
    437 
    438         punpcklbw   mm1,    mm7                     // unpack low
    439         movq        mm4,    mm1                     // make a copy
    440 
    441         punpckhbw   mm3,    mm7                     // unpack high
    442         pmullw      mm4,    mm5                     // d * 2/5
    443 
    444         movq        mm6,    mm3                     // make a copy
    445         pmullw      mm6,    mm5                     // d * 2/5
    446 
    447         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    448         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
    449 
    450         paddw       mm0,    round_values             // + 128
    451         paddw       mm2,    round_values             // + 128
    452 
    453         psrlw       mm0,    8
    454         psrlw       mm2,    8
    455 
    456         packuswb    mm0,    mm2                     // des[3]
    457         movq        QWORD ptr [rdi], mm0            // write des[3]
    458 
    459         //  mm1, mm3 --- Src[3], unpacked before des[3] was written
    460         //  mm7 -- cleared for unpacking
    461 
    462         movq        mm0,    [rdi+rcx*2]             // mm0, Src[0] of the next group (five rows below dest)
    463 
    464         movq        mm5,    four_fifths              // mm5 = 4/5
    465         pmullw      mm1,    mm5                     // d * 4/5
    466 
    467         movq        mm6,    one_fifth                // mm6 = 1/5
    468         movq        mm2,    mm0                     // make a copy
    469 
    470         pmullw      mm3,    mm5                     // d * 4/5
    471         punpcklbw   mm0,    mm7                     // unpack low
    472 
    473         pmullw      mm0,    mm6                     // an * 1/5
    474         punpckhbw   mm2,    mm7                     // unpack high
    475 
    476         paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
    477         pmullw      mm2,    mm6                     // an * 1/5
    478 
    479         paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
    480         paddw       mm1,    round_values             // + 128
    481 
    482         paddw       mm3,    round_values             // + 128
    483         psrlw       mm1,    8
    484 
    485         psrlw       mm3,    8
    486         packuswb    mm1,    mm3                     // des[4]
    487 
    488         movq        QWORD ptr [rdi+rcx], mm1        // write des[4]
    489 
    490         add         rdi,    8                       // advance both pointers by the 8 columns just done
    491         add         rsi,    8
    492 
    493         sub         rdx,    8                       // 8 fewer columns remain
    494         jg          vs_4_5_loop                     // repeat while columns remain
    495     }
    496 }
    497 
    498 /****************************************************************************
    499 *
    500 *  ROUTINE       : last_vertical_band_4_5_scale_mmx
    501 *
    502 *  INPUTS        : unsigned char *dest    : Top row of the band; rows are rewritten in place.
    503 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
    504 *                  unsigned int dest_width : Number of columns; processed 8 at a time.
    505 *
    506 *  OUTPUTS       : None.
    507 *
    508 *  RETURNS       : None
    509 *
    510 *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
    511 *                  Rows 1..3 are rewritten as rounded blends of rows 0..3; row 4 is a copy of the original row 3.
    512 *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does not read
    513 *                  any line below the band. The function also has a "C" only
    514 *                  version.
    515 *                  The loop steps 8 columns per pass, so a width that is not a multiple of 8 is rounded up.
    516 ****************************************************************************/
    517 static
    518 void last_vertical_band_4_5_scale_mmx
    519 (
    520     unsigned char *dest,
    521     unsigned int dest_pitch,
    522     unsigned int dest_width
    523 )
    524 {
    525     __asm
    526     {
    527         mov         rsi,    dest                    // Get the source and destination pointer
    528         mov         ecx,    dest_pitch               // Get the pitch size
    529 
    530         lea         rdi,    [rsi+rcx*2]             // two lines below
    531         add         rdi,    rcx                     // three lines below
    532 
    533         pxor        mm7,    mm7                     // clear out mm7
    534         mov         edx,    dest_width               // Loop counter
    535 
    536         last_vs_4_5_loop:
    537 
    538         movq        mm0,    QWORD ptr [rsi]         // src[0];
    539         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
    540 
    541         movq        mm2,    mm0                     // Make a copy
    542         punpcklbw   mm0,    mm7                     // unpack low to word
    543 
    544         movq        mm5,    one_fifth
    545         punpckhbw   mm2,    mm7                     // unpack high to word
    546 
    547         pmullw      mm0,    mm5                     // a * 1/5
    548 
    549         movq        mm3,    mm1                     // make a copy
    550         punpcklbw   mm1,    mm7                     // unpack low to word
    551 
    552         pmullw      mm2,    mm5                     // a * 1/5
    553         movq        mm6,    four_fifths               // mm6 = 4/5
    554 
    555         movq        mm4,    mm1                     // copy of low b
    556         pmullw      mm4,    mm6                     // b * 4/5
    557 
    558         punpckhbw   mm3,    mm7                     // unpack high to word
    559         movq        mm5,    mm3                     // copy of high b
    560 
    561         pmullw      mm5,    mm6                     // b * 4/5
    562         paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
    563 
    564         paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
    565         paddw       mm0,    round_values             // + 128
    566 
    567         paddw       mm2,    round_values             // + 128
    568         psrlw       mm0,    8
    569 
    570         psrlw       mm2,    8
    571         packuswb    mm0,    mm2                     // des [1]
    572 
    573         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
    574         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
    575 
    576         // mm1, mm3 --- Src[1], unpacked before des[1] was written
    577         // mm0 --- Src[2]
    578         // mm7 for unpacking
    579 
    580         movq        mm5,    two_fifths
    581         movq        mm2,    mm0                     // make a copy
    582 
    583         pmullw      mm1,    mm5                     // b * 2/5
    584         movq        mm6,    three_fifths
    585 
    586 
    587         punpcklbw   mm0,    mm7                     // unpack low to word
    588         pmullw      mm3,    mm5                     // b * 2/5
    589 
    590         movq        mm4,    mm0                     // make copy of c
    591         punpckhbw   mm2,    mm7                     // unpack high to word
    592 
    593         pmullw      mm4,    mm6                     // c * 3/5
    594         movq        mm5,    mm2
    595 
    596         pmullw      mm5,    mm6                     // c * 3/5
    597         paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
    598 
    599         paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
    600         paddw       mm1,    round_values             // + 128
    601 
    602         paddw       mm3,    round_values             // + 128
    603         psrlw       mm1,    8
    604 
    605         psrlw       mm3,    8
    606         packuswb    mm1,    mm3                     // des[2]
    607 
    608         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
    609         movq        mm1,    [rdi]                   // mm1=Src[3];
    610 
    611         movq        QWORD ptr [rdi+rcx], mm1        // write des[4]; a plain copy of Src[3], taken before des[3] is written
    612 
    613         // mm0, mm2 --- Src[2], unpacked before des[2] was written
    614         // mm1 --- Src[3]
    615         // mm6 --- 3/5
    616         // mm7 for unpacking
    617 
    618         pmullw      mm0,    mm6                     // c * 3/5
    619         movq        mm5,    two_fifths               // mm5 = 2/5
    620 
    621         movq        mm3,    mm1                     // make a copy
    622         pmullw      mm2,    mm6                     // c * 3/5
    623 
    624         punpcklbw   mm1,    mm7                     // unpack low
    625         movq        mm4,    mm1                     // make a copy
    626 
    627         punpckhbw   mm3,    mm7                     // unpack high
    628         pmullw      mm4,    mm5                     // d * 2/5
    629 
    630         movq        mm6,    mm3                     // make a copy
    631         pmullw      mm6,    mm5                     // d * 2/5
    632 
    633         paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
    634         paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
    635 
    636         paddw       mm0,    round_values             // + 128
    637         paddw       mm2,    round_values             // + 128
    638 
    639         psrlw       mm0,    8
    640         psrlw       mm2,    8
    641 
    642         packuswb    mm0,    mm2                     // des[3]
    643         movq        QWORD ptr [rdi], mm0            // write des[3]
    644 
    645         //  mm1, mm3 --- Src[3]
    646         //  mm7 -- cleared for unpacking
    647         add         rdi,    8                       // advance both pointers by the 8 columns just done
    648         add         rsi,    8
    649 
    650         sub         rdx,    8                       // 8 fewer columns remain
    651         jg          last_vs_4_5_loop                // repeat while columns remain
    652     }
    653 }
    654 
    655 /****************************************************************************
    656 *
    657 *  ROUTINE       : vertical_band_3_5_scale_mmx
    658 *
    659 *  INPUTS        : unsigned char *dest    :
    660 *                  unsigned int dest_pitch :
    661 *                  unsigned int dest_width :
    662 *
    663 *  OUTPUTS       : None.
    664 *
    665 *  RETURNS       : void
    666 *
    667 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
    668 *
    669 *  SPECIAL NOTES : The routine uses the first line of the band below
    670 *                  the current band. The function also has a "C" only
    671 *                  version.
    672 *
    673 ****************************************************************************/
    674 static
    675 void vertical_band_3_5_scale_mmx
    676 (
    677     unsigned char *dest,
    678     unsigned int dest_pitch,
    679     unsigned int dest_width
    680 )
    681 {
    682     __asm
    683     {
    684         mov         rsi,    dest                    // Get the source and destination pointer
    685         mov         ecx,    dest_pitch               // Get the pitch size
    686 
    687         lea         rdi,    [rsi+rcx*2]             // two lines below
    688         add         rdi,    rcx                     // three lines below
    689 
    690         pxor        mm7,    mm7                     // clear out mm7
    691         mov         edx,    dest_width               // Loop counter
    692 
    693         vs_3_5_loop:
    694 
    695         movq        mm0,    QWORD ptr [rsi]         // src[0];
    696         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
    697 
    698         movq        mm2,    mm0                     // Make a copy
    699         punpcklbw   mm0,    mm7                     // unpack low to word
    700 
    701         movq        mm5,    two_fifths               // mm5 = 2/5
    702         punpckhbw   mm2,    mm7                     // unpack high to word
    703 
    704         pmullw      mm0,    mm5                     // a * 2/5
    705 
    706         movq        mm3,    mm1                     // make a copy
    707         punpcklbw   mm1,    mm7                     // unpack low to word
    708 
    709         pmullw      mm2,    mm5                     // a * 2/5
    710         movq        mm6,    three_fifths             // mm6 = 3/5
    711 
    712         movq        mm4,    mm1                     // copy of low b
    713         pmullw      mm4,    mm6                     // b * 3/5
    714 
    715         punpckhbw   mm3,    mm7                     // unpack high to word
    716         movq        mm5,    mm3                     // copy of high b
    717 
    718         pmullw      mm5,    mm6                     // b * 3/5
    719         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
    720 
    721         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    722         paddw       mm0,    round_values             // + 128
    723 
    724         paddw       mm2,    round_values             // + 128
    725         psrlw       mm0,    8
    726 
    727         psrlw       mm2,    8
    728         packuswb    mm0,    mm2                     // des [1]
    729 
    730         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
    731         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
    732 
    733         // mm1, mm3 --- Src[1]
    734         // mm0 --- Src[2]
    735         // mm7 for unpacking
    736 
    737         movq        mm4,    mm1                     // b low
    738         pmullw      mm1,    four_fifths              // b * 4/5 low
    739 
    740         movq        mm5,    mm3                     // b high
    741         pmullw      mm3,    four_fifths              // b * 4/5 high
    742 
    743         movq        mm2,    mm0                     // c
    744         pmullw      mm4,    one_fifth                // b * 1/5
    745 
    746         punpcklbw   mm0,    mm7                     // c low
    747         pmullw      mm5,    one_fifth                // b * 1/5
    748 
    749         movq        mm6,    mm0                     // make copy of c low
    750         punpckhbw   mm2,    mm7                     // c high
    751 
    752         pmullw      mm6,    one_fifth                // c * 1/5 low
    753         movq        mm7,    mm2                     // make copy of c high
    754 
    755         pmullw      mm7,    one_fifth                // c * 1/5 high
    756         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
    757 
    758         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    759         movq        mm6,    mm0                     // make copy of c low
    760 
    761         pmullw      mm6,    four_fifths              // c * 4/5 low
    762         movq        mm7,    mm2                     // make copy of c high
    763 
    764         pmullw      mm7,    four_fifths              // c * 4/5 high
    765 
    766         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    767         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
    768 
    769         paddw       mm1,    round_values             // + 128
    770         paddw       mm3,    round_values             // + 128
    771 
    772         psrlw       mm1,    8
    773         psrlw       mm3,    8
    774 
    775         packuswb    mm1,    mm3                     // des[2]
    776         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
    777 
    778         paddw       mm4,    round_values             // + 128
    779         paddw       mm5,    round_values             // + 128
    780 
    781         psrlw       mm4,    8
    782         psrlw       mm5,    8
    783 
    784         packuswb    mm4,    mm5                     // des[3]
    785         movq        QWORD ptr [rdi], mm4            // write des[3]
    786 
    787         //  mm0, mm2 --- Src[3]
    788 
    789         pxor        mm7,    mm7                     // clear mm7 for unpacking
    790         movq        mm1,    [rdi+rcx*2]             // mm1 = Src[0] of the next group
    791 
    792         movq        mm5,    three_fifths             // mm5 = 3/5
    793         pmullw      mm0,    mm5                     // d * 3/5
    794 
    795         movq        mm6,    two_fifths                // mm6 = 2/5
    796         movq        mm3,    mm1                     // make a copy
    797 
    798         pmullw      mm2,    mm5                     // d * 3/5
    799         punpcklbw   mm1,    mm7                     // unpack low
    800 
    801         pmullw      mm1,    mm6                     // an * 2/5
    802         punpckhbw   mm3,    mm7                     // unpack high
    803 
    804         paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
    805         pmullw      mm3,    mm6                     // an * 2/5
    806 
    807         paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
    808         paddw       mm0,    round_values             // + 128
    809 
    810         paddw       mm2,    round_values             // + 128
    811         psrlw       mm0,    8
    812 
    813         psrlw       mm2,    8
    814         packuswb    mm0,    mm2                     // des[4]
    815 
    816         movq        QWORD ptr [rdi+rcx], mm0        // write des[4]
    817 
    818         add         rdi,    8
    819         add         rsi,    8
    820 
    821         sub         rdx,    8
    822         jg          vs_3_5_loop
    823     }
    824 }
    825 
    826 /****************************************************************************
    827 *
    828 *  ROUTINE       : last_vertical_band_3_5_scale_mmx
    829 *
    830 *  INPUTS        : unsigned char *dest    :
    831 *                  unsigned int dest_pitch :
    832 *                  unsigned int dest_width :
    833 *
    834 *  OUTPUTS       : None.
    835 *
    836 *  RETURNS       : void
    837 *
    838 *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
    839 *
    840 *  SPECIAL NOTES : The routine uses the first line of the band below
    841 *                  the current band. The function also has an "C" only
    842 *                  version.
    843 *
    844 ****************************************************************************/
    845 static
    846 void last_vertical_band_3_5_scale_mmx
    847 (
    848     unsigned char *dest,
    849     unsigned int dest_pitch,
    850     unsigned int dest_width
    851 )
    852 {
    853     __asm
    854     {
    855         mov         rsi,    dest                    // Get the source and destination pointer
    856         mov         ecx,    dest_pitch               // Get the pitch size
    857 
    858         lea         rdi,    [rsi+rcx*2]             // tow lines below
    859         add         rdi,    rcx                     // three lines below
    860 
    861         pxor        mm7,    mm7                     // clear out mm7
    862         mov         edx,    dest_width               // Loop counter
    863 
    864 
    865         last_vs_3_5_loop:
    866 
    867         movq        mm0,    QWORD ptr [rsi]         // src[0];
    868         movq        mm1,    QWORD ptr [rsi+rcx]     // src[1];
    869 
    870         movq        mm2,    mm0                     // Make a copy
    871         punpcklbw   mm0,    mm7                     // unpack low to word
    872 
    873         movq        mm5,    two_fifths               // mm5 = 2/5
    874         punpckhbw   mm2,    mm7                     // unpack high to word
    875 
    876         pmullw      mm0,    mm5                     // a * 2/5
    877 
    878         movq        mm3,    mm1                     // make a copy
    879         punpcklbw   mm1,    mm7                     // unpack low to word
    880 
    881         pmullw      mm2,    mm5                     // a * 2/5
    882         movq        mm6,    three_fifths             // mm6 = 3/5
    883 
    884         movq        mm4,    mm1                     // copy of low b
    885         pmullw      mm4,    mm6                     // b * 3/5
    886 
    887         punpckhbw   mm3,    mm7                     // unpack high to word
    888         movq        mm5,    mm3                     // copy of high b
    889 
    890         pmullw      mm5,    mm6                     // b * 3/5
    891         paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
    892 
    893         paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
    894         paddw       mm0,    round_values             // + 128
    895 
    896         paddw       mm2,    round_values             // + 128
    897         psrlw       mm0,    8
    898 
    899         psrlw       mm2,    8
    900         packuswb    mm0,    mm2                     // des [1]
    901 
    902         movq        QWORD ptr [rsi+rcx], mm0        // write des[1]
    903         movq        mm0,    [rsi+rcx*2]             // mm0 = src[2]
    904 
    905 
    906 
    907         // mm1, mm3 --- Src[1]
    908         // mm0 --- Src[2]
    909         // mm7 for unpacking
    910 
    911         movq        mm4,    mm1                     // b low
    912         pmullw      mm1,    four_fifths              // b * 4/5 low
    913 
    914         movq        QWORD ptr [rdi+rcx], mm0        // write des[4]
    915 
    916         movq        mm5,    mm3                     // b high
    917         pmullw      mm3,    four_fifths              // b * 4/5 high
    918 
    919         movq        mm2,    mm0                     // c
    920         pmullw      mm4,    one_fifth                // b * 1/5
    921 
    922         punpcklbw   mm0,    mm7                     // c low
    923         pmullw      mm5,    one_fifth                // b * 1/5
    924 
    925         movq        mm6,    mm0                     // make copy of c low
    926         punpckhbw   mm2,    mm7                     // c high
    927 
    928         pmullw      mm6,    one_fifth                // c * 1/5 low
    929         movq        mm7,    mm2                     // make copy of c high
    930 
    931         pmullw      mm7,    one_fifth                // c * 1/5 high
    932         paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
    933 
    934         paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
    935         movq        mm6,    mm0                     // make copy of c low
    936 
    937         pmullw      mm6,    four_fifths              // c * 4/5 low
    938         movq        mm7,    mm2                     // make copy of c high
    939 
    940         pmullw      mm7,    four_fifths              // c * 4/5 high
    941 
    942         paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
    943         paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
    944 
    945         paddw       mm1,    round_values             // + 128
    946         paddw       mm3,    round_values             // + 128
    947 
    948         psrlw       mm1,    8
    949         psrlw       mm3,    8
    950 
    951         packuswb    mm1,    mm3                     // des[2]
    952         movq        QWORD ptr [rsi+rcx*2], mm1      // write des[2]
    953 
    954         paddw       mm4,    round_values             // + 128
    955         paddw       mm5,    round_values             // + 128
    956 
    957         psrlw       mm4,    8
    958         psrlw       mm5,    8
    959 
    960         packuswb    mm4,    mm5                     // des[3]
    961         movq        QWORD ptr [rdi], mm4            // write des[3]
    962 
    963         //  mm0, mm2 --- Src[3]
    964 
    965         add         rdi,    8
    966         add         rsi,    8
    967 
    968         sub         rdx,    8
    969         jg          last_vs_3_5_loop
    970     }
    971 }
    972 
     973 /****************************************************************************
     974 *
     975 *  ROUTINE       : vertical_band_1_2_scale_mmx
     976 *
     977 *  INPUTS        : unsigned char *dest    : Upper source row; the new row is written one pitch below it.
     978 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
     979 *                  unsigned int dest_width : Bytes per row; consumed 8 per loop pass.
     980 *
     981 *  OUTPUTS       : None.
     982 *
     983 *  RETURNS       : void
     984 *
     985 *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
     986 *
     987 *  SPECIAL NOTES : The routine uses the first line of the band below
     988 *                  the current band (read at dest + 2 * dest_pitch). The
     989 *                  function also has a "C"-only version.
     990 *
     991 ****************************************************************************/
     992 static
     993 void vertical_band_1_2_scale_mmx
     994 (
     995     unsigned char *dest,
     996     unsigned int dest_pitch,
     997     unsigned int dest_width
     998 )
     999 {
    1000     __asm
    1001     {
    1002 
    1003         mov         rsi,    dest                    // Get the source and destination pointer
    1004         mov         ecx,    dest_pitch               // Get the pitch size
    1005 
    1006         pxor        mm7,    mm7                     // clear out mm7 (zero source for unpacking)
    1007         mov         edx,    dest_width               // Loop counter
    1008 
    1009         vs_1_2_loop:
    1010 
    1011         movq        mm0,    [rsi]                   // get Src[0] (row at dest)
    1012         movq        mm1,    [rsi + rcx * 2]         // get Src[1] (row two pitches below)
    1013 
    1014         movq        mm2,    mm0                     // make copy before unpack
    1015         movq        mm3,    mm1                     // make copy before unpack
    1016 
    1017         punpcklbw   mm0,    mm7                     // low Src[0]
    1018         movq        mm6,    four_ones                // mm6= 1, 1, 1, 1
    1019 
    1020         punpcklbw   mm1,    mm7                     // low Src[1]
    1021         paddw       mm0,    mm1                     // low (a + b)
    1022 
    1023         punpckhbw   mm2,    mm7                     // high Src[0]
    1024         paddw       mm0,    mm6                     // low (a + b + 1)
    1025 
    1026         punpckhbw   mm3,    mm7                     // high Src[1]
    1027         paddw       mm2,    mm3                     // high (a + b )
    1028 
    1029         psraw       mm0,    1                       // low (a + b +1 )/2
    1030         paddw       mm2,    mm6                     // high (a + b + 1)
    1031 
    1032         psraw       mm2,    1                       // high (a + b + 1)/2
    1033         packuswb    mm0,    mm2                     // pack results
    1034 
    1035         movq        [rsi+rcx], mm0                  // write the averaged eight bytes to the row in between
    1036         add         rsi,    8                       // next eight columns
    1037 
    1038         sub         rdx,    8                       // sets the flags tested by jg
    1039         jg          vs_1_2_loop
    1040     }
    1041 
    1042 }
   1043 
    1044 /****************************************************************************
    1045 *
    1046 *  ROUTINE       : last_vertical_band_1_2_scale_mmx
    1047 *
    1048 *  INPUTS        : unsigned char *dest    : Source row; it is duplicated one pitch below.
    1049 *                  unsigned int dest_pitch : Byte offset between consecutive rows.
    1050 *                  unsigned int dest_width : Bytes per row; copied 8 per loop pass.
    1051 *
    1052 *  OUTPUTS       : None.
    1053 *
    1054 *  RETURNS       : void
    1055 *
    1056 *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
    1057 *
    1058 *  SPECIAL NOTES : Last-band variant: no line below the current band is
    1059 *                  read; the row at dest is copied unchanged to
    1060 *                  dest + dest_pitch. The function also has a "C"-only version.
    1061 *
    1062 ****************************************************************************/
    1063 static
    1064 void last_vertical_band_1_2_scale_mmx
    1065 (
    1066     unsigned char *dest,
    1067     unsigned int dest_pitch,
    1068     unsigned int dest_width
    1069 )
    1070 {
    1071     __asm
    1072     {
    1073         mov         rsi,    dest                    // Get the source and destination pointer
    1074         mov         ecx,    dest_pitch               // Get the pitch size
    1075 
    1076         mov         edx,    dest_width               // Loop counter
    1077 
    1078         last_vs_1_2_loop:
    1079 
    1080         movq        mm0,    [rsi]                   // get Src[0]
    1081         movq        [rsi+rcx], mm0                  // write the same eight bytes one row down
    1082 
    1083         add         rsi,    8                       // next eight columns
    1084         sub         rdx,    8                       // sets the flags tested by jg
    1085 
    1086         jg          last_vs_1_2_loop
    1087     }
    1088 }
   1089 
    1090 /****************************************************************************
    1091 *
    1092 *  ROUTINE       : horizontal_line_1_2_scale_mmx
    1093 *
    1094 *  INPUTS        : const unsigned char *source : Source line.
    1095 *                  unsigned int source_width    : Number of source pixels.
    1096 *                  unsigned char *dest         : Destination line (two bytes written per source pixel).
    1097 *                  unsigned int dest_width      : Not used.
    1098 *
    1099 *  OUTPUTS       : None.
    1100 *
    1101 *  RETURNS       : void
    1102 *
    1103 *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
    1104 *
    1105 *  SPECIAL NOTES : The final eight pixels are handled after the loop, with the last pixel as its own neighbour.
    1106 *
    1107 ****************************************************************************/
    1108 static
    1109 void horizontal_line_1_2_scale_mmx
    1110 (
    1111     const unsigned char *source,
    1112     unsigned int source_width,
    1113     unsigned char *dest,
    1114     unsigned int dest_width
    1115 )
    1116 {
    1117     (void) dest_width;
    1118 
    1119     __asm
    1120     {
    1121         mov         rsi,    source                  // rsi = read pointer
    1122         mov         rdi,    dest                    // rdi = write pointer
    1123 
    1124         pxor        mm7,    mm7                     // mm7 = 0, used for byte -> word unpacking
    1125         movq        mm6,    four_ones               // mm6 = 1, 1, 1, 1 (rounding term)
    1126 
    1127         mov         ecx,    source_width            // pixels still to process
    1128         // NOTE(review): the loop body runs once before the first count check - confirm callers pass source_width >= 16
    1129         hs_1_2_loop:
    1130 
    1131         movq        mm0,    [rsi]                   // a = pixels 0..7
    1132         movq        mm1,    [rsi+1]                 // b = pixels 1..8 (right-hand neighbours)
    1133 
    1134         movq        mm2,    mm0                     // copy of a for the high half
    1135         movq        mm3,    mm1                     // copy of b for the high half
    1136 
    1137         movq        mm4,    mm0                     // keep the original pixels for interleaving
    1138         punpcklbw   mm0,    mm7                     // low a to words
    1139 
    1140         punpcklbw   mm1,    mm7                     // low b to words
    1141         paddw       mm0,    mm1                     // low (a + b)
    1142 
    1143         paddw       mm0,    mm6                     // low (a + b + 1)
    1144         punpckhbw   mm2,    mm7                     // high a to words
    1145 
    1146         punpckhbw   mm3,    mm7                     // high b to words
    1147         paddw       mm2,    mm3                     // high (a + b)
    1148 
    1149         paddw       mm2,    mm6                     // high (a + b + 1)
    1150         psraw       mm0,    1                       // low (a + b + 1) / 2
    1151 
    1152         psraw       mm2,    1                       // high (a + b + 1) / 2
    1153         packuswb    mm0,    mm2                     // mm0 = eight interpolated pixels
    1154 
    1155         movq        mm2,    mm4                     // original pixels again
    1156         punpcklbw   mm2,    mm0                     // interleave low half: orig, interp, orig, interp, ...
    1157 
    1158         movq        [rdi],  mm2                     // first eight output bytes
    1159         punpckhbw   mm4,    mm0                     // interleave high half
    1160 
    1161         movq        [rdi+8], mm4                    // next eight output bytes
    1162         add         rsi,    8                       // consumed eight source pixels
    1163 
    1164         add         rdi,    16                      // produced sixteen output pixels
    1165         sub         rcx,    8
    1166 
    1167         cmp         rcx,    8                       // leave the final eight pixels for the code below
    1168         jg          hs_1_2_loop
    1169 
    1170 // last eight pixels: nothing is read past them, the final pixel is paired with itself
    1171 
    1172         movq        mm0,    [rsi]                   // a = last eight pixels
    1173         movq        mm1,    mm0
    1174 
    1175         movq        mm2,    mm0                     // copy of a for the high half
    1176         movq        mm3,    mm1
    1177 
    1178         psrlq       mm1,    8                       // shift down one byte: pixels 1..7, top byte zero
    1179         psrlq       mm3,    56                      // isolate pixel 7 ...
    1180 
    1181         psllq       mm3,    56                      // ... and move it back to the top byte
    1182         por         mm1,    mm3                     // b = pixels 1..7 followed by pixel 7 again
    1183 
    1184         movq        mm3,    mm1                     // copy of b for the high half
    1185         movq        mm4,    mm0                     // keep the original pixels for interleaving
    1186 
    1187         punpcklbw   mm0,    mm7                     // low a to words
    1188         punpcklbw   mm1,    mm7                     // low b to words
    1189 
    1190         paddw       mm0,    mm1                     // low (a + b)
    1191         paddw       mm0,    mm6                     // low (a + b + 1)
    1192 
    1193         punpckhbw   mm2,    mm7                     // high a to words
    1194         punpckhbw   mm3,    mm7                     // high b to words
    1195 
    1196         paddw       mm2,    mm3                     // high (a + b)
    1197         paddw       mm2,    mm6                     // high (a + b + 1)
    1198 
    1199         psraw       mm0,    1                       // low (a + b + 1) / 2
    1200         psraw       mm2,    1                       // high (a + b + 1) / 2
    1201 
    1202         packuswb    mm0,    mm2                     // mm0 = eight interpolated pixels
    1203         movq        mm2,    mm4                     // original pixels again
    1204 
    1205         punpcklbw   mm2,    mm0                     // interleave low half
    1206         movq        [rdi],  mm2                     // first eight output bytes
    1207 
    1208         punpckhbw   mm4,    mm0                     // interleave high half
    1209         movq        [rdi+8], mm4                    // final eight output bytes
    1210     }
    1211 }
   1212 
   1213 
   1214 
   1215 
   1216 
    1217 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };   // x/256 weights for pixels 1..4 in horizontal_line_5_4_scale_mmx
    1218 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };   // x/256 weights for pixels 0..3 in horizontal_line_5_4_scale_mmx
   1219 
   1220 
   1221 /****************************************************************************
   1222 *
   1223 *  ROUTINE       : horizontal_line_5_4_scale_mmx
   1224 *
   1225 *  INPUTS        : const unsigned char *source : Pointer to source data.
   1226 *                  unsigned int source_width    : Stride of source.
   1227 *                  unsigned char *dest         : Pointer to destination data.
   1228 *                  unsigned int dest_width      : Stride of destination (NOT USED).
   1229 *
   1230 *  OUTPUTS       : None.
   1231 *
   1232 *  RETURNS       : void
   1233 *
   1234 *  FUNCTION      : Copies horizontal line of pixels from source to
   1235 *                  destination scaling up by 4 to 5.
   1236 *
   1237 *  SPECIAL NOTES : None.
   1238 *
   1239 ****************************************************************************/
   1240 static
   1241 void horizontal_line_5_4_scale_mmx
   1242 (
   1243     const unsigned char *source,
   1244     unsigned int source_width,
   1245     unsigned char *dest,
   1246     unsigned int dest_width
   1247 )
   1248 {
   1249     /*
   1250     unsigned i;
   1251     unsigned int a, b, c, d, e;
   1252     unsigned char *des = dest;
   1253     const unsigned char *src = source;
   1254 
   1255     (void) dest_width;
   1256 
   1257     for ( i=0; i<source_width; i+=5 )
   1258     {
   1259         a = src[0];
   1260         b = src[1];
   1261         c = src[2];
   1262         d = src[3];
   1263         e = src[4];
   1264 
   1265         des[0] = a;
   1266         des[1] = ((b*192 + c* 64 + 128)>>8);
   1267         des[2] = ((c*128 + d*128 + 128)>>8);
   1268         des[3] = ((d* 64 + e*192 + 128)>>8);
   1269 
   1270         src += 5;
   1271         des += 4;
   1272     }
   1273     */
   1274     __asm
   1275     {
   1276 
   1277         mov         rsi,        source              ;
   1278         mov         rdi,        dest                ;
   1279 
   1280         mov         ecx,        source_width         ;
   1281         movq        mm5,        const54_1           ;
   1282 
   1283         pxor        mm7,        mm7                 ;
   1284         movq        mm6,        const54_2           ;
   1285 
   1286         movq        mm4,        round_values         ;
   1287         lea         rdx,        [rsi+rcx]           ;
   1288         horizontal_line_5_4_loop:
   1289 
   1290         movq        mm0,        QWORD PTR  [rsi]    ;
   1291         00 01 02 03 04 05 06 07
   1292         movq        mm1,        mm0                 ;
   1293         00 01 02 03 04 05 06 07
   1294 
   1295         psrlq       mm0,        8                   ;
   1296         01 02 03 04 05 06 07 xx
   1297         punpcklbw   mm1,        mm7                 ;
   1298         xx 00 xx 01 xx 02 xx 03
   1299 
   1300         punpcklbw   mm0,        mm7                 ;
   1301         xx 01 xx 02 xx 03 xx 04
   1302         pmullw      mm1,        mm5
   1303 
   1304         pmullw      mm0,        mm6
   1305         add         rsi,        5
   1306 
   1307         add         rdi,        4
   1308         paddw       mm1,        mm0
   1309 
   1310         paddw       mm1,        mm4
   1311         psrlw       mm1,        8
   1312 
   1313         cmp         rsi,        rdx
   1314         packuswb    mm1,        mm7
   1315 
   1316         movd        DWORD PTR [rdi-4], mm1
   1317 
   1318         jl          horizontal_line_5_4_loop
   1319 
   1320     }
   1321 
   1322 }
    1323 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };   //  64/256 = 1/4 in each word lane
    1324 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };   // 128/256 = 2/4 in each word lane
    1325 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };   // 192/256 = 3/4 in each word lane
    1326 /* 5-to-4 vertical down-scale: reads five source rows, writes four dest rows, four columns per loop pass. */
    1327 static
    1328 void vertical_band_5_4_scale_mmx
    1329 (
    1330     unsigned char *source,
    1331     unsigned int src_pitch,
    1332     unsigned char *dest,
    1333     unsigned int dest_pitch,
    1334     unsigned int dest_width
    1335 )
    1336 {
    1337 
    1338     __asm
    1339     {
    1340 
    1341         mov         rsi,    source                    // Get the source pointer
    1342         mov         ecx,    src_pitch               // Get the source pitch
    1343 
    1344         mov         rdi,    dest                    // Get the destination pointer
    1345         pxor        mm7,    mm7                     // clear out mm7
    1346 
    1347         mov         edx,    dest_pitch               // Get the destination pitch
    1348         mov         ebx,    dest_width              // Loop counter
    1349 
    1350         vs_5_4_loop:
    1351 
    1352         movd        mm0,    DWORD ptr [rsi]         // src[0];
    1353         movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];
    1354 
    1355         movd        mm2,    DWORD ptr [rsi+rcx*2]   // src[2]
    1356         lea         rax,    [rsi+rcx*2]             // rax = address of src[2]
    1357 
    1358         punpcklbw   mm1,    mm7                     // b to words
    1359         punpcklbw   mm2,    mm7                     // c to words
    1360 
    1361         movq        mm3,    mm2                     // copy of c
    1362         pmullw      mm1,    three_fourths           // b * 3/4
    1363 
    1364         pmullw      mm2,    one_fourths             // c * 1/4
    1365         movd        mm4,    [rax+rcx]               // src[3]
    1366 
    1367         pmullw      mm3,    two_fourths             // c * 2/4
    1368         punpcklbw   mm4,    mm7                     // d to words
    1369 
    1370         movq        mm5,    mm4                     // copy of d
    1371         pmullw      mm4,    two_fourths             // d * 2/4
    1372 
    1373         paddw       mm1,    mm2                     // b * 3/4 + c * 1/4
    1374         movd        mm6,    [rax+rcx*2]             // src[4]
    1375 
    1376         pmullw      mm5,    one_fourths             // d * 1/4
    1377         paddw       mm1,    round_values;           // + 128
    1378 
    1379         paddw       mm3,    mm4                     // c * 2/4 + d * 2/4
    1380         psrlw       mm1,    8
    1381 
    1382         punpcklbw   mm6,    mm7                     // e to words
    1383         paddw       mm3,    round_values            // + 128
    1384 
    1385         pmullw      mm6,    three_fourths           // e * 3/4
    1386         psrlw       mm3,    8
    1387 
    1388         packuswb    mm1,    mm7                     // des[1]
    1389         packuswb    mm3,    mm7                     // des[2]
    1390 
    1391         movd        DWORD PTR [rdi], mm0            // des[0] = src[0], copied unchanged
    1392         movd        DWORD PTR [rdi+rdx], mm1        // write des[1]
    1393 
    1394 
    1395         paddw       mm5,    mm6                     // d * 1/4 + e * 3/4
    1396         movd        DWORD PTR [rdi+rdx*2], mm3      // write des[2]
    1397 
    1398         lea         rax,    [rdi+rdx*2]             // rax = address of des[2], taken before rdi advances
    1399         paddw       mm5,    round_values            // + 128
    1400 
    1401         psrlw       mm5,    8
    1402         add         rdi,    4                       // next four destination columns
    1403 
    1404         packuswb    mm5,    mm7                     // des[3]
    1405         movd        DWORD PTR [rax+rdx], mm5        // write des[3]
    1406 
    1407         add         rsi,    4                       // next four source columns
    1408         sub         rbx,    4                       // sets the flags tested by jg
    1409 
    1410         jg         vs_5_4_loop
    1411     }
    1412 }
   1413 
   1414 
    1415 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };   // x/256 weights for lanes {-, p1, p3, p5} in horizontal_line_5_3_scale_mmx
    1416 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };   // x/256 weights for lanes {p0, p2, p4, p6} in horizontal_line_5_3_scale_mmx
   1417 
   1418 
   1419 static
   1420 void horizontal_line_5_3_scale_mmx
   1421 (
   1422     const unsigned char *source,
   1423     unsigned int source_width,
   1424     unsigned char *dest,
   1425     unsigned int dest_width
   1426 )
   1427 {
   1428     __asm
   1429     {
   1430 
   1431         mov         rsi,        source              ;
   1432         mov         rdi,        dest                ;
   1433 
   1434         mov         ecx,        source_width         ;
   1435         movq        mm5,        const53_1           ;
   1436 
   1437         pxor        mm7,        mm7                 ;
   1438         movq        mm6,        const53_2           ;
   1439 
   1440         movq        mm4,        round_values         ;
   1441         lea         rdx,        [rsi+rcx-5]         ;
   1442         horizontal_line_5_3_loop:
   1443 
   1444         movq        mm0,        QWORD PTR  [rsi]    ;
   1445         00 01 02 03 04 05 06 07
   1446         movq        mm1,        mm0                 ;
   1447         00 01 02 03 04 05 06 07
   1448 
   1449         psllw       mm0,        8                   ;
   1450         xx 00 xx 02 xx 04 xx 06
   1451         psrlw       mm1,        8                   ;
   1452         01 xx 03 xx 05 xx 07 xx
   1453 
   1454         psrlw       mm0,        8                   ;
   1455         00 xx 02 xx 04 xx 06 xx
   1456         psllq       mm1,        16                  ;
   1457         xx xx 01 xx 03 xx 05 xx
   1458 
   1459         pmullw      mm0,        mm6
   1460 
   1461         pmullw      mm1,        mm5
   1462         add         rsi,        5
   1463 
   1464         add         rdi,        3
   1465         paddw       mm1,        mm0
   1466 
   1467         paddw       mm1,        mm4
   1468         psrlw       mm1,        8
   1469 
   1470         cmp         rsi,        rdx
   1471         packuswb    mm1,        mm7
   1472 
   1473         movd        DWORD PTR [rdi-3], mm1
   1474         jl          horizontal_line_5_3_loop
   1475 
   1476 //exit condition
   1477         movq        mm0,        QWORD PTR  [rsi]    ;
   1478         00 01 02 03 04 05 06 07
   1479         movq        mm1,        mm0                 ;
   1480         00 01 02 03 04 05 06 07
   1481 
   1482         psllw       mm0,        8                   ;
   1483         xx 00 xx 02 xx 04 xx 06
   1484         psrlw       mm1,        8                   ;
   1485         01 xx 03 xx 05 xx 07 xx
   1486 
   1487         psrlw       mm0,        8                   ;
   1488         00 xx 02 xx 04 xx 06 xx
   1489         psllq       mm1,        16                  ;
   1490         xx xx 01 xx 03 xx 05 xx
   1491 
   1492         pmullw      mm0,        mm6
   1493 
   1494         pmullw      mm1,        mm5
   1495         paddw       mm1,        mm0
   1496 
   1497         paddw       mm1,        mm4
   1498         psrlw       mm1,        8
   1499 
   1500         packuswb    mm1,        mm7
   1501         movd        rax,        mm1
   1502 
   1503         mov         rdx,        rax
   1504         shr         rdx,        16
   1505 
   1506         mov         WORD PTR[rdi],   ax
   1507         mov         BYTE PTR[rdi+2], dl
   1508 
   1509     }
   1510 
   1511 }
   1512 
    1513 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };   //  85/256, roughly 1/3, in each word lane
    1514 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };   // 171/256, roughly 2/3, in each word lane
    1515 /* 5-to-3 vertical down-scale: reads five source rows, writes three dest rows, four columns per loop pass. */
    1516 static
    1517 void vertical_band_5_3_scale_mmx
    1518 (
    1519     unsigned char *source,
    1520     unsigned int src_pitch,
    1521     unsigned char *dest,
    1522     unsigned int dest_pitch,
    1523     unsigned int dest_width
    1524 )
    1525 {
    1526 
    1527     __asm
    1528     {
    1529 
    1530         mov         rsi,    source                    // Get the source pointer
    1531         mov         ecx,    src_pitch               // Get the source pitch
    1532 
    1533         mov         rdi,    dest                    // Get the destination pointer
    1534         pxor        mm7,    mm7                     // clear out mm7
    1535 
    1536         mov         edx,    dest_pitch               // Get the destination pitch
    1537         movq        mm5,    one_thirds              // mm5 = 1/3
    1538 
    1539         movq        mm6,    two_thirds              // mm6 = 2/3
    1540         mov         ebx,    dest_width;             // Loop counter
    1541 
    1542         vs_5_3_loop:
    1543 
    1544         movd        mm0,    DWORD ptr [rsi]         // src[0];
    1545         movd        mm1,    DWORD ptr [rsi+rcx]     // src[1];
    1546 
    1547         movd        mm2,    DWORD ptr [rsi+rcx*2]   // src[2]
    1548         lea         rax,    [rsi+rcx*2]             // rax = address of src[2]
    1549 
    1550         punpcklbw   mm1,    mm7                     // b to words
    1551         punpcklbw   mm2,    mm7                     // c to words
    1552 
    1553         pmullw      mm1,    mm5                     // b * 1/3
    1554         pmullw      mm2,    mm6                     // c * 2/3
    1555 
    1556         movd        mm3,    DWORD ptr [rax+rcx]     // src[3]
    1557         movd        mm4,    DWORD ptr [rax+rcx*2]   // src[4]
    1558 
    1559         punpcklbw   mm3,    mm7                     // d to words
    1560         punpcklbw   mm4,    mm7                     // e to words
    1561 
    1562         pmullw      mm3,    mm6                     // d * 2/3
    1563         pmullw      mm4,    mm5                     // e * 1/3
    1564 
    1565 
    1566         movd        DWORD PTR [rdi], mm0            // des[0] = src[0], copied unchanged
    1567         paddw       mm1,    mm2                     // b * 1/3 + c * 2/3
    1568 
    1569         paddw       mm1,    round_values            // + 128
    1570         psrlw       mm1,    8
    1571 
    1572         packuswb    mm1,    mm7                     // des[1]
    1573         paddw       mm3,    mm4                     // d * 2/3 + e * 1/3
    1574 
    1575         paddw       mm3,    round_values            // + 128
    1576         movd        DWORD PTR [rdi+rdx], mm1        // write des[1]
    1577 
    1578         psrlw       mm3,    8
    1579         packuswb    mm3,    mm7                     // des[2]
    1580 
    1581         movd        DWORD PTR [rdi+rdx*2], mm3      // write des[2]
    1582 
    1583 
    1584         add         rdi,    4                       // next four destination columns
    1585         add         rsi,    4                       // next four source columns
    1586 
    1587         sub         rbx,    4                       // sets the flags tested by jg
    1588         jg          vs_5_3_loop
    1589     }
    1590 }
   1591 
   1592 
   1593 
   1594 
   1595 /****************************************************************************
   1596 *
   1597 *  ROUTINE       : horizontal_line_2_1_scale_mmx
   1598 *
   1599 *  INPUTS        : const unsigned char *source : Source row; 8 bytes are loaded per loop pass.
   1600 *                  unsigned int source_width    : Not referenced by this routine.
   1601 *                  unsigned char *dest         : Destination row; 4 bytes are stored per loop pass.
   1602 *                  unsigned int dest_width      : Output pixel count, used as the loop bound.
   1603 *
   1604 *  OUTPUTS       : dest receives every second byte of source.
   1605 *
   1606 *  RETURNS       : void
   1607 *
   1608 *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels by
        *                  keeping the even-indexed source bytes (no filtering).
   1609 *
   1610 *  SPECIAL NOTES : The loop is bottom-tested and advances 4 output pixels per
        *                  pass, so it always stores at least 4 bytes and effectively
        *                  rounds dest_width up to a multiple of 4; twice that many
        *                  source bytes are loaded.
        *                  No emms is issued in this routine.
        *                  NOTE(review): presumably the caller resets the MMX state
        *                  before any x87 floating-point use - confirm.
   1611 *
   1612 ****************************************************************************/
   1613 static
   1614 void horizontal_line_2_1_scale_mmx
   1615 (
   1616     const unsigned char *source,
   1617     unsigned int source_width,
   1618     unsigned char *dest,
   1619     unsigned int dest_width
   1620 )
   1621 {
        /* source_width is the parameter that is never referenced; dest_width is
         * read by the asm block below and needs no unused-parameter cast. */
   1622     (void) source_width;
   1623
   1624     __asm
   1625     {
   1626         mov         rsi,    source                  // rsi = read pointer
   1627         mov         rdi,    dest                    // rdi = write pointer
   1628
   1629         pxor        mm7,    mm7                     // mm7 = 0, supplies the upper half for packuswb
   1630         mov         ecx,    dest_width              // loop bound; the 32-bit write zero-extends into rcx
   1631
   1632         xor         rdx,    rdx                     // rdx = output byte offset
   1633         hs_2_1_loop:
   1634
   1635         movq        mm0,    [rsi+rdx*2]             // load 8 source bytes at offset 2*rdx
   1636         psllw       mm0,    8                       // per 16-bit word: shift out the high (odd-indexed) byte
   1637
   1638         psrlw       mm0,    8                       // ...and bring the low (even-indexed) byte back down
   1639         packuswb    mm0,    mm7                     // 4 words -> low 4 bytes; high 4 bytes come from mm7 (zero)
   1640
   1641         movd        DWORD Ptr [rdi+rdx], mm0;       // store 4 output bytes
   1642         add         rdx,    4                       // next group of 4 output pixels
   1643
   1644         cmp         rdx,    rcx
   1645         jl          hs_2_1_loop                     // signed compare: repeat while offset < dest_width
   1646
   1647     }
   1648 }
   1649 
   1650 
   1651 
   1652 static
   1653 void vertical_band_2_1_scale_mmx
   1654 (
   1655     unsigned char *source,
   1656     unsigned int src_pitch,
   1657     unsigned char *dest,
   1658     unsigned int dest_pitch,
   1659     unsigned int dest_width)
   1660 {
            /*
             * Unfiltered vertical 2:1 step: produces one output row by handing
             * source, dest and dest_width straight to vpx_memcpy. No neighbouring
             * rows are read, and src_pitch / dest_pitch are not referenced.
             *
             * NOTE(review): vpx_memcpy is declared outside this chunk; presumably
             * it has memcpy semantics (copies dest_width bytes, buffers must not
             * overlap) - confirm.
             */
   1661     vpx_memcpy(dest, source, dest_width);
   1662 }
   1663 
   1664 
   1665 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };  /* 48/256 = 3/16, one copy per 16-bit lane */
   1666 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };  /* 160/256 = 10/16, one copy per 16-bit lane */
   1667
        /*
         * vertical_band_2_1_scale_i_mmx
         *
         * Produces one filtered output row from three source rows, 4 pixels per
         * loop pass:
         *
         *   dest[x] = (48 * above[x] + 160 * cur[x] + 48 * below[x] + 128) >> 8
         *
         * where above = source - src_pitch, cur = source and
         * below = source + src_pitch. The weights sum to 256 and 128 is the
         * rounding term taken from round_values.
         *
         * source     : pointer to the centre row; the row one pitch before it
         *              and the row one pitch after it are also read.
         * src_pitch  : byte distance between consecutive source rows.
         * dest       : output row; 4 bytes are stored per loop pass.
         * dest_pitch : not referenced by this routine.
         * dest_width : number of output pixels. The loop is bottom-tested and
         *              steps by 4, so at least 4 bytes are always written and
         *              the width is effectively rounded up to a multiple of 4.
         *
         * No emms is issued in this routine.
         * NOTE(review): presumably the caller resets the MMX state - confirm.
         */
   1668 static
   1669 void vertical_band_2_1_scale_i_mmx
   1670 (
   1671     unsigned char *source,
   1672     unsigned int src_pitch,
   1673     unsigned char *dest,
   1674     unsigned int dest_pitch,
   1675     unsigned int dest_width
   1676 )
   1677 {
   1678     __asm
   1679     {
   1680         mov         rsi,        source          // rsi = read pointer (centre row for now)
   1681         mov         rdi,        dest            // rdi = write pointer
   1682
   1683         mov         eax,        src_pitch       // 32-bit write zero-extends into rax
   1684         mov         edx,        dest_width      // 32-bit write zero-extends into rdx
   1685
   1686         pxor        mm7,        mm7             // mm7 = 0, used to widen bytes to words and to pad packuswb
   1687         sub         rsi,        rax             // back one line: rsi now points at the row above source
   1688
   1689
   1690         lea         rcx,        [rsi+rdx];      // rcx = end-of-row sentinel for rsi (row above + dest_width)
   1691         movq        mm6,        round_values;   // loaded but not read again below; the add uses the memory operand
   1692
   1693         movq        mm5,        three_sixteenths;   // mm5 = outer-row weight (48)
   1694         movq        mm4,        ten_sixteenths;     // mm4 = centre-row weight (160)
   1695
   1696         vs_2_1_i_loop:
   1697         movd        mm0,        [rsi]           // 4 pixels from the row above
   1698         movd        mm1,        [rsi+rax]       // 4 pixels from the centre row
   1699
   1700         movd        mm2,        [rsi+rax*2]     // 4 pixels from the row below
   1701         punpcklbw   mm0,        mm7             // widen bytes to 16-bit words
   1702
   1703         pmullw      mm0,        mm5             // above * 48
   1704         punpcklbw   mm1,        mm7
   1705
   1706         pmullw      mm1,        mm4             // centre * 160
   1707         punpcklbw   mm2,        mm7
   1708
   1709         pmullw      mm2,        mm5             // below * 48
   1710         paddw       mm0,        round_values    // + 128 so the shift below rounds to nearest
   1711
   1712         paddw       mm1,        mm2
   1713         paddw       mm0,        mm1             // weighted sum; at most 255*256 + 128, which fits in 16 bits
   1714
   1715         psrlw       mm0,        8               // divide by 256
   1716         packuswb    mm0,        mm7             // back to bytes in the low 4 bytes of mm0
   1717
   1718         movd        DWORD PTR [rdi],        mm0 // store 4 output pixels
   1719         add         rsi,        4
   1720
   1721         add         rdi,        4;
   1722         cmp         rsi,        rcx
   1723         jl          vs_2_1_i_loop               // signed pointer compare: repeat while rsi < end sentinel
   1724
   1725     }
   1726 }
   1727 
   1728 
   1729 
   1730 void
   1731 register_mmxscalers(void)
   1732 {
            /*
             * Points each vp8_* scaler hook at the matching *_mmx routine.
             * The function is not declared static, so it has external linkage.
             *
             * NOTE(review): the vp8_* names and most of the *_mmx routines are
             * declared outside this chunk; the hooks are presumably global
             * function pointers with signatures matching the routines - confirm.
             */

            /* Ratios named x_y with x < y. */
   1733     vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
   1734     vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
   1735     vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
   1736     vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
   1737     vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
   1738     vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
   1739     vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
   1740     vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
   1741     vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
   1742
            /* Ratios named x_y with x > y. */
   1743     vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
   1744     vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
   1745     vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
   1746     vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
   1747     vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
   1748     vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
   1749     vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
   1750 }
   1751