Home | History | Annotate | Download | only in win32
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 /****************************************************************************
     13 *
     14 *   Module Title :     scaleopt.cpp
     15 *
     16 *   Description  :     Optimized scaling functions
     17 *
     18 ****************************************************************************/
     19 #include "pragmas.h"
     20 
     21 /****************************************************************************
     22 *  Module Statics
     23 ****************************************************************************/
     24 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
     25 
     26 #include "vpx_scale/vpx_scale.h"
     27 #include "vpx_mem/vpx_mem.h"
     28 
     29 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
     30 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
     31 
     32 
     33 /****************************************************************************
     34  *
     35  *  ROUTINE       : horizontal_line_5_4_scale_mmx
     36  *
     37  *  INPUTS        : const unsigned char *source : Pointer to source data.
     38  *                  unsigned int source_width    : Stride of source.
     39  *                  unsigned char *dest         : Pointer to destination data.
     40  *                  unsigned int dest_width      : Stride of destination (NOT USED).
     41  *
     42  *  OUTPUTS       : None.
     43  *
     44  *  RETURNS       : void
     45  *
     46  *  FUNCTION      : Copies horizontal line of pixels from source to
     47  *                  destination scaling up by 4 to 5.
     48  *
     49  *  SPECIAL NOTES : None.
     50  *
     51  ****************************************************************************/
     52 static
     53 void horizontal_line_5_4_scale_mmx
     54 (
     55   const unsigned char *source,
     56   unsigned int source_width,
     57   unsigned char *dest,
     58   unsigned int dest_width
     59 ) {
     60   /*
     61   unsigned i;
     62   unsigned int a, b, c, d, e;
     63   unsigned char *des = dest;
     64   const unsigned char *src = source;
     65 
     66   (void) dest_width;
     67 
     68   for ( i=0; i<source_width; i+=5 )
     69   {
     70       a = src[0];
     71       b = src[1];
     72       c = src[2];
     73       d = src[3];
     74       e = src[4];
     75 
     76       des[0] = a;
     77       des[1] = ((b*192 + c* 64 + 128)>>8);
     78       des[2] = ((c*128 + d*128 + 128)>>8);
     79       des[3] = ((d* 64 + e*192 + 128)>>8);
     80 
     81       src += 5;
     82       des += 4;
     83   }
     84   */
     85   (void) dest_width;
     86 
     87   __asm {
     88 
     89     mov         esi,        source;
     90     mov         edi,        dest;
     91 
     92     mov         ecx,        source_width;
     93     movq        mm5,        const54_1;
     94 
     95     pxor        mm7,        mm7;
     96     movq        mm6,        const54_2;
     97 
     98     movq        mm4,        round_values;
     99     lea         edx,        [esi+ecx];
    100     horizontal_line_5_4_loop:
    101 
    102     movq        mm0,        QWORD PTR  [esi];
    103     00 01 02 03 04 05 06 07
    104     movq        mm1,        mm0;
    105     00 01 02 03 04 05 06 07
    106 
    107     psrlq       mm0,        8;
    108     01 02 03 04 05 06 07 xx
    109     punpcklbw   mm1,        mm7;
    110     xx 00 xx 01 xx 02 xx 03
    111 
    112     punpcklbw   mm0,        mm7;
    113     xx 01 xx 02 xx 03 xx 04
    114     pmullw      mm1,        mm5
    115 
    116     pmullw      mm0,        mm6
    117     add         esi,        5
    118 
    119     add         edi,        4
    120     paddw       mm1,        mm0
    121 
    122     paddw       mm1,        mm4
    123     psrlw       mm1,        8
    124 
    125     cmp         esi,        edx
    126     packuswb    mm1,        mm7
    127 
    128     movd        DWORD PTR [edi-4], mm1
    129 
    130     jl          horizontal_line_5_4_loop
    131 
    132   }
    133 
    134 }
    135 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
    136 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
    137 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
    138 
    139 static
    140 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
    141 
    142   __asm {
    143     push        ebx
    144 
    145     mov         esi,    source                    // Get the source and destination pointer
    146     mov         ecx,    src_pitch               // Get the pitch size
    147 
    148     mov         edi,    dest                    // tow lines below
    149     pxor        mm7,    mm7                     // clear out mm7
    150 
    151     mov         edx,    dest_pitch               // Loop counter
    152     mov         ebx,    dest_width
    153 
    154     vs_5_4_loop:
    155 
    156     movd        mm0,    DWORD ptr [esi]         // src[0];
    157     movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
    158 
    159     movd        mm2,    DWORD ptr [esi+ecx*2]
    160     lea         eax,    [esi+ecx*2]             //
    161 
    162     punpcklbw   mm1,    mm7
    163     punpcklbw   mm2,    mm7
    164 
    165     movq        mm3,    mm2
    166     pmullw      mm1,    three_fourths
    167 
    168     pmullw      mm2,    one_fourths
    169     movd        mm4,    [eax+ecx]
    170 
    171     pmullw      mm3,    two_fourths
    172     punpcklbw   mm4,    mm7
    173 
    174     movq        mm5,    mm4
    175     pmullw      mm4,    two_fourths
    176 
    177     paddw       mm1,    mm2
    178     movd        mm6,    [eax+ecx*2]
    179 
    180     pmullw      mm5,    one_fourths
    181     paddw       mm1,    round_values;
    182 
    183     paddw       mm3,    mm4
    184     psrlw       mm1,    8
    185 
    186     punpcklbw   mm6,    mm7
    187     paddw       mm3,    round_values
    188 
    189     pmullw      mm6,    three_fourths
    190     psrlw       mm3,    8
    191 
    192     packuswb    mm1,    mm7
    193     packuswb    mm3,    mm7
    194 
    195     movd        DWORD PTR [edi], mm0
    196     movd        DWORD PTR [edi+edx], mm1
    197 
    198 
    199     paddw       mm5,    mm6
    200     movd        DWORD PTR [edi+edx*2], mm3
    201 
    202     lea         eax,    [edi+edx*2]
    203     paddw       mm5,    round_values
    204 
    205     psrlw       mm5,    8
    206     add         edi,    4
    207 
    208     packuswb    mm5,    mm7
    209     movd        DWORD PTR [eax+edx], mm5
    210 
    211     add         esi,    4
    212     sub         ebx,    4
    213 
    214     jg         vs_5_4_loop
    215 
    216     pop         ebx
    217   }
    218 }
    219 
    220 
    221 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
    222 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
    223 
    224 
    225 static
    226 void horizontal_line_5_3_scale_mmx
    227 (
    228   const unsigned char *source,
    229   unsigned int source_width,
    230   unsigned char *dest,
    231   unsigned int dest_width
    232 ) {
    233 
    234   (void) dest_width;
    235   __asm {
    236 
    237     mov         esi,        source;
    238     mov         edi,        dest;
    239 
    240     mov         ecx,        source_width;
    241     movq        mm5,        const53_1;
    242 
    243     pxor        mm7,        mm7;
    244     movq        mm6,        const53_2;
    245 
    246     movq        mm4,        round_values;
    247     lea         edx,        [esi+ecx-5];
    248     horizontal_line_5_3_loop:
    249 
    250     movq        mm0,        QWORD PTR  [esi];
    251     00 01 02 03 04 05 06 07
    252     movq        mm1,        mm0;
    253     00 01 02 03 04 05 06 07
    254 
    255     psllw       mm0,        8;
    256     xx 00 xx 02 xx 04 xx 06
    257     psrlw       mm1,        8;
    258     01 xx 03 xx 05 xx 07 xx
    259 
    260     psrlw       mm0,        8;
    261     00 xx 02 xx 04 xx 06 xx
    262     psllq       mm1,        16;
    263     xx xx 01 xx 03 xx 05 xx
    264 
    265     pmullw      mm0,        mm6
    266 
    267     pmullw      mm1,        mm5
    268     add         esi,        5
    269 
    270     add         edi,        3
    271     paddw       mm1,        mm0
    272 
    273     paddw       mm1,        mm4
    274     psrlw       mm1,        8
    275 
    276     cmp         esi,        edx
    277     packuswb    mm1,        mm7
    278 
    279     movd        DWORD PTR [edi-3], mm1
    280     jl          horizontal_line_5_3_loop
    281 
    282 // exit condition
    283     movq        mm0,        QWORD PTR  [esi];
    284     00 01 02 03 04 05 06 07
    285     movq        mm1,        mm0;
    286     00 01 02 03 04 05 06 07
    287 
    288     psllw       mm0,        8;
    289     xx 00 xx 02 xx 04 xx 06
    290     psrlw       mm1,        8;
    291     01 xx 03 xx 05 xx 07 xx
    292 
    293     psrlw       mm0,        8;
    294     00 xx 02 xx 04 xx 06 xx
    295     psllq       mm1,        16;
    296     xx xx 01 xx 03 xx 05 xx
    297 
    298     pmullw      mm0,        mm6
    299 
    300     pmullw      mm1,        mm5
    301     paddw       mm1,        mm0
    302 
    303     paddw       mm1,        mm4
    304     psrlw       mm1,        8
    305 
    306     packuswb    mm1,        mm7
    307     movd        eax,        mm1
    308 
    309     mov         edx,        eax
    310     shr         edx,        16
    311 
    312     mov         WORD PTR[edi],   ax
    313     mov         BYTE PTR[edi+2], dl
    314 
    315   }
    316 
    317 }
    318 
    319 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
    320 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
    321 
    322 static
    323 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
    324 
    325   __asm {
    326     push        ebx
    327 
    328     mov         esi,    source                    // Get the source and destination pointer
    329     mov         ecx,    src_pitch               // Get the pitch size
    330 
    331     mov         edi,    dest                    // tow lines below
    332     pxor        mm7,    mm7                     // clear out mm7
    333 
    334     mov         edx,    dest_pitch               // Loop counter
    335     movq        mm5,    one_thirds
    336 
    337     movq        mm6,    two_thirds
    338     mov         ebx,    dest_width;
    339 
    340     vs_5_3_loop:
    341 
    342     movd        mm0,    DWORD ptr [esi]         // src[0];
    343     movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
    344 
    345     movd        mm2,    DWORD ptr [esi+ecx*2]
    346     lea         eax,    [esi+ecx*2]             //
    347 
    348     punpcklbw   mm1,    mm7
    349     punpcklbw   mm2,    mm7
    350 
    351     pmullw      mm1,    mm5
    352     pmullw      mm2,    mm6
    353 
    354     movd        mm3,    DWORD ptr [eax+ecx]
    355     movd        mm4,    DWORD ptr [eax+ecx*2]
    356 
    357     punpcklbw   mm3,    mm7
    358     punpcklbw   mm4,    mm7
    359 
    360     pmullw      mm3,    mm6
    361     pmullw      mm4,    mm5
    362 
    363 
    364     movd        DWORD PTR [edi], mm0
    365     paddw       mm1,    mm2
    366 
    367     paddw       mm1,    round_values
    368     psrlw       mm1,    8
    369 
    370     packuswb    mm1,    mm7
    371     paddw       mm3,    mm4
    372 
    373     paddw       mm3,    round_values
    374     movd        DWORD PTR [edi+edx], mm1
    375 
    376     psrlw       mm3,    8
    377     packuswb    mm3,    mm7
    378 
    379     movd        DWORD PTR [edi+edx*2], mm3
    380 
    381 
    382     add         edi,    4
    383     add         esi,    4
    384 
    385     sub         ebx,    4
    386     jg          vs_5_3_loop
    387 
    388     pop         ebx
    389   }
    390 }
    391 
    392 
    393 
    394 
    395 /****************************************************************************
    396  *
    397  *  ROUTINE       : horizontal_line_2_1_scale
    398  *
    399  *  INPUTS        : const unsigned char *source :
    400  *                  unsigned int source_width    :
    401  *                  unsigned char *dest         :
    402  *                  unsigned int dest_width      :
    403  *
    404  *  OUTPUTS       : None.
    405  *
    406  *  RETURNS       : void
    407  *
    408  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
    409  *
    410  *  SPECIAL NOTES : None.
    411  *
    412  ****************************************************************************/
    413 static
    414 void horizontal_line_2_1_scale_mmx
    415 (
    416   const unsigned char *source,
    417   unsigned int source_width,
    418   unsigned char *dest,
    419   unsigned int dest_width
    420 ) {
    421   (void) dest_width;
    422   (void) source_width;
    423   __asm {
    424     mov         esi,    source
    425     mov         edi,    dest
    426 
    427     pxor        mm7,    mm7
    428     mov         ecx,    dest_width
    429 
    430     xor         edx,    edx
    431     hs_2_1_loop:
    432 
    433     movq        mm0,    [esi+edx*2]
    434     psllw       mm0,    8
    435 
    436     psrlw       mm0,    8
    437     packuswb    mm0,    mm7
    438 
    439     movd        DWORD Ptr [edi+edx], mm0;
    440     add         edx,    4
    441 
    442     cmp         edx,    ecx
    443     jl          hs_2_1_loop
    444 
    445   }
    446 }
    447 
    448 
    449 
    450 static
    451 void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
    452   (void) dest_pitch;
    453   (void) src_pitch;
    454   vpx_memcpy(dest, source, dest_width);
    455 }
    456 
    457 
    458 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
    459 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
    460 
    461 static
    462 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
    463 
    464   (void) dest_pitch;
    465   __asm {
    466     mov         esi,        source
    467     mov         edi,        dest
    468 
    469     mov         eax,        src_pitch
    470     mov         edx,        dest_width
    471 
    472     pxor        mm7,        mm7
    473     sub         esi,        eax             // back one line
    474 
    475 
    476     lea         ecx,        [esi+edx];
    477     movq        mm6,        round_values;
    478 
    479     movq        mm5,        three_sixteenths;
    480     movq        mm4,        ten_sixteenths;
    481 
    482     vs_2_1_i_loop:
    483     movd        mm0,        [esi]           //
    484     movd        mm1,        [esi+eax]       //
    485 
    486     movd        mm2,        [esi+eax*2]     //
    487     punpcklbw   mm0,        mm7
    488 
    489     pmullw      mm0,        mm5
    490     punpcklbw   mm1,        mm7
    491 
    492     pmullw      mm1,        mm4
    493     punpcklbw   mm2,        mm7
    494 
    495     pmullw      mm2,        mm5
    496     paddw       mm0,        round_values
    497 
    498     paddw       mm1,        mm2
    499     paddw       mm0,        mm1
    500 
    501     psrlw       mm0,        8
    502     packuswb    mm0,        mm7
    503 
    504     movd        DWORD PTR [edi],        mm0
    505     add         esi,        4
    506 
    507     add         edi,        4;
    508     cmp         esi,        ecx
    509     jl          vs_2_1_i_loop
    510 
    511   }
    512 }
    513 
    514 
    515 
    516 void
    517 register_mmxscalers(void) {
    518   vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
    519   vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
    520   vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
    521   vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
    522   vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
    523   vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
    524   vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
    525 }
    526