Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include "memory.h"
     13 #include "preproc.h"
     14 #include "pragmas.h"
     15 
     16 /****************************************************************************
     17 *  Macros
     18 ****************************************************************************/
     19 #define FRAMECOUNT 7
     20 #define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
     21 
     22 /****************************************************************************
     23 *  Imports
     24 ****************************************************************************/
     25 extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
     26 
     27 /****************************************************************************
     28 *  Exported Global Variables
     29 ****************************************************************************/
     30 void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
     31 
     32 /****************************************************************************
     33  *
     34  *  ROUTINE       : temp_filter_wmt
     35  *
     36  *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
     37  *                  unsigned char *s     : Pointer to source frame.
     38  *                  unsigned char *d     : Pointer to destination frame.
     39  *                  int bytes            : Number of bytes to filter.
     40  *                  int strength         : Strength of filter to apply.
     41  *
     42  *  OUTPUTS       : None.
     43  *
     44  *  RETURNS       : void
     45  *
     46  *  FUNCTION      : Performs a closesness adjusted temporarl blur
     47  *
     48  *  SPECIAL NOTES : Destination frame can be same as source frame.
     49  *
     50  ****************************************************************************/
     51 void temp_filter_wmt
     52 (
     53     pre_proc_instance *ppi,
     54     unsigned char *s,
     55     unsigned char *d,
     56     int bytes,
     57     int strength
     58 )
     59 {
     60     int byte = 0;
     61     unsigned char *frameptr = ppi->frame_buffer;
     62 
     63     __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
     64     __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
     65 
     66     if (ppi->frame == 0)
     67     {
     68         do
     69         {
     70             int i;
     71             int frame = 0;
     72 
     73             do
     74             {
     75                 for (i = 0; i < 8; i++)
     76                 {
     77                     *frameptr = s[byte+i];
     78                     ++frameptr;
     79                 }
     80 
     81                 ++frame;
     82             }
     83             while (frame < FRAMECOUNT);
     84 
     85             for (i = 0; i < 8; i++)
     86                 d[byte+i] = s[byte+i];
     87 
     88             byte += 8;
     89 
     90         }
     91         while (byte < bytes);
     92     }
     93     else
     94     {
     95         int i;
     96         int offset2 = (ppi->frame % FRAMECOUNT);
     97 
     98         do
     99         {
    100             __declspec(align(16)) unsigned short counts[8];
    101             __declspec(align(16)) unsigned short sums[8];
    102             __asm
    103             {
    104                 mov         eax, offset2
    105                 mov         edi, s                  // source pixels
    106                 pxor        xmm1, xmm1              // accumulator
    107 
    108                 pxor        xmm7, xmm7
    109 
    110                 mov         esi, frameptr           // accumulator
    111                 pxor        xmm2, xmm2              // count
    112 
    113                 movq        xmm3, QWORD PTR [edi]
    114 
    115                 movq        QWORD PTR [esi+8*eax], xmm3
    116 
    117                 punpcklbw   xmm3, xmm2              // xmm3 source pixels
    118                 mov         ecx,  FRAMECOUNT
    119 
    120                 next_frame:
    121                 movq        xmm4, QWORD PTR [esi]   // get frame buffer values
    122                 punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
    123                 movdqa      xmm6, xmm4              // save the pixel values
    124                 psubsw      xmm4, xmm3              // subtracted pixel values
    125                 pmullw      xmm4, xmm4              // square xmm4
    126                 movd        xmm5, strength
    127                 psrlw       xmm4, xmm5              // should be strength
    128                 pmullw      xmm4, threes            // 3 * modifier
    129                 movdqa      xmm5, sixteens          // 16s
    130                 psubusw     xmm5, xmm4              // 16 - modifiers
    131                 movdqa      xmm4, xmm5              // save the modifiers
    132                 pmullw      xmm4, xmm6              // multiplier values
    133                 paddusw     xmm1, xmm4              // accumulator
    134                 paddusw     xmm2, xmm5              // count
    135                 add         esi, 8                  // next frame
    136                 dec         ecx                     // next set of eight pixels
    137                 jnz         next_frame
    138 
    139                 movdqa      counts, xmm2
    140                 psrlw       xmm2, 1                 // divide count by 2 for rounding
    141                 paddusw     xmm1, xmm2              // rounding added in
    142 
    143                 mov         frameptr, esi
    144 
    145                 movdqa      sums, xmm1
    146             }
    147 
    148             for (i = 0; i < 8; i++)
    149             {
    150                 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
    151                 blurvalue >>= 16;
    152                 d[i] = blurvalue;
    153             }
    154 
    155             s += 8;
    156             d += 8;
    157             byte += 8;
    158         }
    159         while (byte < bytes);
    160     }
    161 
    162     ++ppi->frame;
    163     __asm emms
    164 }
    165 
    166 /****************************************************************************
    167  *
    168  *  ROUTINE       : temp_filter_mmx
    169  *
    170  *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
    171  *                  unsigned char *s     : Pointer to source frame.
    172  *                  unsigned char *d     : Pointer to destination frame.
    173  *                  int bytes            : Number of bytes to filter.
    174  *                  int strength         : Strength of filter to apply.
    175  *
    176  *  OUTPUTS       : None.
    177  *
    178  *  RETURNS       : void
    179  *
    180  *  FUNCTION      : Performs a closesness adjusted temporarl blur
    181  *
    182  *  SPECIAL NOTES : Destination frame can be same as source frame.
    183  *
    184  ****************************************************************************/
    185 void temp_filter_mmx
    186 (
    187     pre_proc_instance *ppi,
    188     unsigned char *s,
    189     unsigned char *d,
    190     int bytes,
    191     int strength
    192 )
    193 {
    194     int byte = 0;
    195     unsigned char *frameptr = ppi->frame_buffer;
    196 
    197     __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
    198     __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
    199 
    200     if (ppi->frame == 0)
    201     {
    202         do
    203         {
    204             int i;
    205             int frame = 0;
    206 
    207             do
    208             {
    209                 for (i = 0; i < 4; i++)
    210                 {
    211                     *frameptr = s[byte+i];
    212                     ++frameptr;
    213                 }
    214 
    215                 ++frame;
    216             }
    217             while (frame < FRAMECOUNT);
    218 
    219             for (i = 0; i < 4; i++)
    220                 d[byte+i] = s[byte+i];
    221 
    222             byte += 4;
    223 
    224         }
    225         while (byte < bytes);
    226     }
    227     else
    228     {
    229         int i;
    230         int offset2 = (ppi->frame % FRAMECOUNT);
    231 
    232         do
    233         {
    234             __declspec(align(16)) unsigned short counts[8];
    235             __declspec(align(16)) unsigned short sums[8];
    236             __asm
    237             {
    238 
    239                 mov         eax, offset2
    240                 mov         edi, s                  // source pixels
    241                 pxor        mm1, mm1                // accumulator
    242                 pxor        mm7, mm7
    243 
    244                 mov         esi, frameptr           // accumulator
    245                 pxor        mm2, mm2                // count
    246 
    247                 movd        mm3, DWORD PTR [edi]
    248                 movd        DWORD PTR [esi+4*eax], mm3
    249 
    250                 punpcklbw   mm3, mm2                // mm3 source pixels
    251                 mov         ecx,  FRAMECOUNT
    252 
    253                 next_frame:
    254                 movd        mm4, DWORD PTR [esi]    // get frame buffer values
    255                 punpcklbw   mm4, mm7                // mm4 frame buffer pixels
    256                 movq        mm6, mm4                // save the pixel values
    257                 psubsw      mm4, mm3                // subtracted pixel values
    258                 pmullw      mm4, mm4                // square mm4
    259                 movd        mm5, strength
    260                 psrlw       mm4, mm5                // should be strength
    261                 pmullw      mm4, threes             // 3 * modifier
    262                 movq        mm5, sixteens           // 16s
    263                 psubusw     mm5, mm4                // 16 - modifiers
    264                 movq        mm4, mm5                // save the modifiers
    265                 pmullw      mm4, mm6                // multiplier values
    266                 paddusw     mm1, mm4                // accumulator
    267                 paddusw     mm2, mm5                // count
    268                 add         esi, 4                  // next frame
    269                 dec         ecx                     // next set of eight pixels
    270                 jnz         next_frame
    271 
    272                 movq        counts, mm2
    273                 psrlw       mm2, 1                  // divide count by 2 for rounding
    274                 paddusw     mm1, mm2                // rounding added in
    275 
    276                 mov         frameptr, esi
    277 
    278                 movq        sums, mm1
    279 
    280             }
    281 
    282             for (i = 0; i < 4; i++)
    283             {
    284                 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
    285                 blurvalue >>= 16;
    286                 d[i] = blurvalue;
    287             }
    288 
    289             s += 4;
    290             d += 4;
    291             byte += 4;
    292         }
    293         while (byte < bytes);
    294     }
    295 
    296     ++ppi->frame;
    297     __asm emms
    298 }
    299