/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
     10 
     11 
     12 #include <math.h>
     13 #include <stdlib.h>
     14 #include "vpx_scale/yv12config.h"
     15 #include "pragmas.h"
     16 
     17 #define VP8_FILTER_WEIGHT 128
     18 #define VP8_FILTER_SHIFT  7
     19 
     20 
     21 
     22 /* static constants */
     23 __declspec(align(16))
     24 const static short  Blur[48] =
     25 {
     26 
     27     16, 16, 16, 16, 16, 16, 16, 16,
     28     16, 16, 16, 16, 16, 16, 16, 16,
     29     64, 64, 64, 64, 64, 64, 64, 64,
     30     16, 16, 16, 16, 16, 16, 16, 16,
     31     16, 16, 16, 16, 16, 16, 16, 16,
     32     0,  0,  0,  0,  0,  0,  0,  0,
     33 
     34 };
     35 #define RD  __declspec(align(16)) __int64 rd  = 0x0040004000400040;
     36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
     37 
     38 #ifndef RELOCATEABLE
     39 const static RD;
     40 const static R4D2;
     41 #endif
     42 
     43 
     44 /* external references */
     45 extern double vp8_gaussian(double sigma, double mu, double x);
     46 extern short vp8_rv[];
     47 extern int vp8_q2mbl(int x) ;
     48 
     49 
     50 
     51 void vp8_post_proc_down_and_across_mmx
     52 (
     53     unsigned char *src_ptr,
     54     unsigned char *dst_ptr,
     55     int src_pixels_per_line,
     56     int dst_pixels_per_line,
     57     int rows,
     58     int cols,
     59     int flimit
     60 )
     61 {
     62 #ifdef RELOCATEABLE
     63     RD
     64     R4D2
     65 #endif
     66 
     67     __asm
     68     {
     69         push        ebx
     70         lea         ebx, Blur
     71         movd        mm2, flimit
     72         punpcklwd   mm2, mm2
     73         punpckldq   mm2, mm2
     74 
     75         mov         esi,        src_ptr
     76         mov         edi,        dst_ptr
     77 
     78         mov         ecx, DWORD PTR rows
     79         mov         eax, src_pixels_per_line ;
     80         destination pitch?
     81         pxor        mm0, mm0              ;
     82         mm0 = 00000000
     83 
     84         nextrow:
     85 
     86         xor         edx,        edx       ;
     87 
     88         clear out edx for use as loop counter
     89         nextcol:
     90 
     91         pxor        mm7, mm7              ;
     92 
     93     mm7 = 00000000
     94     movq        mm6, [ebx + 32 ]      ;
     95         mm6 = kernel 2 taps
     96         movq        mm3, [esi]            ;
     97         mm4 = r0 p0..p7
     98         punpcklbw   mm3, mm0              ;
     99         mm3 = p0..p3
    100         movq        mm1, mm3              ;
    101         mm1 = p0..p3
    102         pmullw      mm3, mm6              ;
    103         mm3 *= kernel 2 modifiers
    104 
    105         movq        mm6, [ebx + 48]       ;
    106         mm6 = kernel 3 taps
    107         movq        mm5, [esi + eax]      ;
    108         mm4 = r1 p0..p7
    109         punpcklbw   mm5, mm0              ;
    110         mm5 = r1 p0..p3
    111         pmullw      mm6, mm5              ;
    112         mm6 *= p0..p3 * kernel 3 modifiers
    113         paddusw     mm3, mm6              ;
    114         mm3 += mm6
    115 
    116         ;
    117         thresholding
    118         movq        mm7, mm1              ;
    119         mm7 = r0 p0..p3
    120         psubusw     mm7, mm5              ;
    121         mm7 = r0 p0..p3 - r1 p0..p3
    122         psubusw     mm5, mm1              ;
    123         mm5 = r1 p0..p3 - r0 p0..p3
    124         paddusw     mm7, mm5              ;
    125         mm7 = abs(r0 p0..p3 - r1 p0..p3)
    126         pcmpgtw     mm7, mm2
    127 
    128         movq        mm6, [ebx + 64 ]      ;
    129         mm6 = kernel 4 modifiers
    130         movq        mm5, [esi + 2*eax]    ;
    131         mm4 = r2 p0..p7
    132         punpcklbw   mm5, mm0              ;
    133         mm5 = r2 p0..p3
    134         pmullw      mm6, mm5              ;
    135         mm5 *= kernel 4 modifiers
    136         paddusw     mm3, mm6              ;
    137         mm3 += mm5
    138 
    139         ;
    140         thresholding
    141         movq        mm6, mm1              ;
    142         mm6 = r0 p0..p3
    143         psubusw     mm6, mm5              ;
    144         mm6 = r0 p0..p3 - r2 p0..p3
    145         psubusw     mm5, mm1              ;
    146         mm5 = r2 p0..p3 - r2 p0..p3
    147         paddusw     mm6, mm5              ;
    148         mm6 = abs(r0 p0..p3 - r2 p0..p3)
    149         pcmpgtw     mm6, mm2
    150         por         mm7, mm6              ;
    151         accumulate thresholds
    152 
    153 
    154         neg         eax
    155         movq        mm6, [ebx ]           ;
    156         kernel 0 taps
    157         movq        mm5, [esi+2*eax]      ;
    158         mm4 = r-2 p0..p7
    159         punpcklbw   mm5, mm0              ;
    160         mm5 = r-2 p0..p3
    161         pmullw      mm6, mm5              ;
    162         mm5 *= kernel 0 modifiers
    163         paddusw     mm3, mm6              ;
    164         mm3 += mm5
    165 
    166         ;
    167         thresholding
    168         movq        mm6, mm1              ;
    169         mm6 = r0 p0..p3
    170         psubusw     mm6, mm5              ;
    171         mm6 = p0..p3 - r-2 p0..p3
    172         psubusw     mm5, mm1              ;
    173         mm5 = r-2 p0..p3 - p0..p3
    174         paddusw     mm6, mm5              ;
    175         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
    176         pcmpgtw     mm6, mm2
    177         por         mm7, mm6              ;
    178         accumulate thresholds
    179 
    180         movq        mm6, [ebx + 16]       ;
    181         kernel 1 taps
    182         movq        mm4, [esi+eax]        ;
    183         mm4 = r-1 p0..p7
    184         punpcklbw   mm4, mm0              ;
    185         mm4 = r-1 p0..p3
    186         pmullw      mm6, mm4              ;
    187         mm4 *= kernel 1 modifiers.
    188         paddusw     mm3, mm6              ;
    189         mm3 += mm5
    190 
    191         ;
    192         thresholding
    193         movq        mm6, mm1              ;
    194         mm6 = r0 p0..p3
    195         psubusw     mm6, mm4              ;
    196         mm6 = p0..p3 - r-2 p0..p3
    197         psubusw     mm4, mm1              ;
    198         mm5 = r-1 p0..p3 - p0..p3
    199         paddusw     mm6, mm4              ;
    200         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
    201         pcmpgtw     mm6, mm2
    202         por         mm7, mm6              ;
    203         accumulate thresholds
    204 
    205 
    206         paddusw     mm3, rd               ;
    207         mm3 += round value
    208         psraw       mm3, VP8_FILTER_SHIFT     ;
    209         mm3 /= 128
    210 
    211         pand        mm1, mm7              ;
    212         mm1 select vals > thresh from source
    213         pandn       mm7, mm3              ;
    214         mm7 select vals < thresh from blurred result
    215         paddusw     mm1, mm7              ;
    216         combination
    217 
    218         packuswb    mm1, mm0              ;
    219         pack to bytes
    220 
    221         movd        [edi], mm1            ;
    222         neg         eax                   ;
    223         pitch is positive
    224 
    225 
    226         add         esi, 4
    227         add         edi, 4
    228         add         edx, 4
    229 
    230         cmp         edx, cols
    231         jl          nextcol
    232         // done with the all cols, start the across filtering in place
    233         sub         esi, edx
    234         sub         edi, edx
    235 
    236 
    237         push        eax
    238         xor         edx,    edx
    239         mov         eax,    [edi-4];
    240 
    241         acrossnextcol:
    242         pxor        mm7, mm7              ;
    243         mm7 = 00000000
    244         movq        mm6, [ebx + 32 ]      ;
    245         movq        mm4, [edi+edx]        ;
    246         mm4 = p0..p7
    247         movq        mm3, mm4              ;
    248         mm3 = p0..p7
    249         punpcklbw   mm3, mm0              ;
    250         mm3 = p0..p3
    251         movq        mm1, mm3              ;
    252         mm1 = p0..p3
    253         pmullw      mm3, mm6              ;
    254         mm3 *= kernel 2 modifiers
    255 
    256         movq        mm6, [ebx + 48]
    257         psrlq       mm4, 8                ;
    258         mm4 = p1..p7
    259         movq        mm5, mm4              ;
    260         mm5 = p1..p7
    261         punpcklbw   mm5, mm0              ;
    262         mm5 = p1..p4
    263         pmullw      mm6, mm5              ;
    264         mm6 *= p1..p4 * kernel 3 modifiers
    265         paddusw     mm3, mm6              ;
    266         mm3 += mm6
    267 
    268         ;
    269         thresholding
    270         movq        mm7, mm1              ;
    271         mm7 = p0..p3
    272         psubusw     mm7, mm5              ;
    273         mm7 = p0..p3 - p1..p4
    274         psubusw     mm5, mm1              ;
    275         mm5 = p1..p4 - p0..p3
    276         paddusw     mm7, mm5              ;
    277         mm7 = abs(p0..p3 - p1..p4)
    278         pcmpgtw     mm7, mm2
    279 
    280         movq        mm6, [ebx + 64 ]
    281         psrlq       mm4, 8                ;
    282         mm4 = p2..p7
    283         movq        mm5, mm4              ;
    284         mm5 = p2..p7
    285         punpcklbw   mm5, mm0              ;
    286         mm5 = p2..p5
    287         pmullw      mm6, mm5              ;
    288         mm5 *= kernel 4 modifiers
    289         paddusw     mm3, mm6              ;
    290         mm3 += mm5
    291 
    292         ;
    293         thresholding
    294         movq        mm6, mm1              ;
    295         mm6 = p0..p3
    296         psubusw     mm6, mm5              ;
    297         mm6 = p0..p3 - p1..p4
    298         psubusw     mm5, mm1              ;
    299         mm5 = p1..p4 - p0..p3
    300         paddusw     mm6, mm5              ;
    301         mm6 = abs(p0..p3 - p1..p4)
    302         pcmpgtw     mm6, mm2
    303         por         mm7, mm6              ;
    304         accumulate thresholds
    305 
    306 
    307         movq        mm6, [ebx ]
    308         movq        mm4, [edi+edx-2]      ;
    309         mm4 = p-2..p5
    310         movq        mm5, mm4              ;
    311         mm5 = p-2..p5
    312         punpcklbw   mm5, mm0              ;
    313         mm5 = p-2..p1
    314         pmullw      mm6, mm5              ;
    315         mm5 *= kernel 0 modifiers
    316         paddusw     mm3, mm6              ;
    317         mm3 += mm5
    318 
    319         ;
    320         thresholding
    321         movq        mm6, mm1              ;
    322         mm6 = p0..p3
    323         psubusw     mm6, mm5              ;
    324         mm6 = p0..p3 - p1..p4
    325         psubusw     mm5, mm1              ;
    326         mm5 = p1..p4 - p0..p3
    327         paddusw     mm6, mm5              ;
    328         mm6 = abs(p0..p3 - p1..p4)
    329         pcmpgtw     mm6, mm2
    330         por         mm7, mm6              ;
    331         accumulate thresholds
    332 
    333         movq        mm6, [ebx + 16]
    334         psrlq       mm4, 8                ;
    335         mm4 = p-1..p5
    336         punpcklbw   mm4, mm0              ;
    337         mm4 = p-1..p2
    338         pmullw      mm6, mm4              ;
    339         mm4 *= kernel 1 modifiers.
    340         paddusw     mm3, mm6              ;
    341         mm3 += mm5
    342 
    343         ;
    344         thresholding
    345         movq        mm6, mm1              ;
    346         mm6 = p0..p3
    347         psubusw     mm6, mm4              ;
    348         mm6 = p0..p3 - p1..p4
    349         psubusw     mm4, mm1              ;
    350         mm5 = p1..p4 - p0..p3
    351         paddusw     mm6, mm4              ;
    352         mm6 = abs(p0..p3 - p1..p4)
    353         pcmpgtw     mm6, mm2
    354         por         mm7, mm6              ;
    355         accumulate thresholds
    356 
    357         paddusw     mm3, rd               ;
    358         mm3 += round value
    359         psraw       mm3, VP8_FILTER_SHIFT     ;
    360         mm3 /= 128
    361 
    362         pand        mm1, mm7              ;
    363         mm1 select vals > thresh from source
    364         pandn       mm7, mm3              ;
    365         mm7 select vals < thresh from blurred result
    366         paddusw     mm1, mm7              ;
    367         combination
    368 
    369         packuswb    mm1, mm0              ;
    370         pack to bytes
    371         mov         DWORD PTR [edi+edx-4],  eax   ;
    372         store previous four bytes
    373         movd        eax,    mm1
    374 
    375         add         edx, 4
    376         cmp         edx, cols
    377         jl          acrossnextcol;
    378 
    379         mov         DWORD PTR [edi+edx-4],  eax
    380         pop         eax
    381 
    382         // done with this rwo
    383         add         esi, eax               ;
    384         next line
    385         mov         eax, dst_pixels_per_line ;
    386         destination pitch?
    387         add         edi, eax               ;
    388         next destination
    389         mov         eax, src_pixels_per_line ;
    390         destination pitch?
    391 
    392         dec         ecx                   ;
    393         decrement count
    394         jnz         nextrow               ;
    395         next row
    396         pop         ebx
    397 
    398     }
    399 }
    400 
    401 
    402 
    403 void vp8_post_proc_down_and_across_xmm
    404 (
    405     unsigned char *src_ptr,
    406     unsigned char *dst_ptr,
    407     int src_pixels_per_line,
    408     int dst_pixels_per_line,
    409     int rows,
    410     int cols,
    411     int flimit
    412 )
    413 {
    414 #ifdef RELOCATEABLE
    415     R4D2
    416 #endif
    417 
    418     __asm
    419     {
    420         movd        xmm2,       flimit
    421         punpcklwd   xmm2,       xmm2
    422         punpckldq   xmm2,       xmm2
    423         punpcklqdq  xmm2,       xmm2
    424 
    425         mov         esi,        src_ptr
    426         mov         edi,        dst_ptr
    427 
    428         mov         ecx,        DWORD PTR rows
    429         mov         eax,        src_pixels_per_line ;
    430         destination pitch?
    431         pxor        xmm0,       xmm0              ;
    432         mm0 = 00000000
    433 
    434         nextrow:
    435 
    436         xor         edx,        edx       ;
    437 
    438         clear out edx for use as loop counter
    439         nextcol:
    440         movq        xmm3,       QWORD PTR [esi]         ;
    441 
    442         mm4 = r0 p0..p7
    443         punpcklbw   xmm3,       xmm0                    ;
    444         mm3 = p0..p3
    445         movdqa      xmm1,       xmm3                    ;
    446         mm1 = p0..p3
    447         psllw       xmm3,       2                       ;
    448 
    449         movq        xmm5,       QWORD PTR [esi + eax]   ;
    450         mm4 = r1 p0..p7
    451         punpcklbw   xmm5,       xmm0                    ;
    452         mm5 = r1 p0..p3
    453         paddusw     xmm3,       xmm5                    ;
    454         mm3 += mm6
    455 
    456         ;
    457         thresholding
    458         movdqa      xmm7,       xmm1                    ;
    459         mm7 = r0 p0..p3
    460         psubusw     xmm7,       xmm5                    ;
    461         mm7 = r0 p0..p3 - r1 p0..p3
    462         psubusw     xmm5,       xmm1                    ;
    463         mm5 = r1 p0..p3 - r0 p0..p3
    464         paddusw     xmm7,       xmm5                    ;
    465         mm7 = abs(r0 p0..p3 - r1 p0..p3)
    466         pcmpgtw     xmm7,       xmm2
    467 
    468         movq        xmm5,       QWORD PTR [esi + 2*eax] ;
    469         mm4 = r2 p0..p7
    470         punpcklbw   xmm5,       xmm0                    ;
    471         mm5 = r2 p0..p3
    472         paddusw     xmm3,       xmm5                    ;
    473         mm3 += mm5
    474 
    475         ;
    476         thresholding
    477         movdqa      xmm6,       xmm1                    ;
    478         mm6 = r0 p0..p3
    479         psubusw     xmm6,       xmm5                    ;
    480         mm6 = r0 p0..p3 - r2 p0..p3
    481         psubusw     xmm5,       xmm1                    ;
    482         mm5 = r2 p0..p3 - r2 p0..p3
    483         paddusw     xmm6,       xmm5                    ;
    484         mm6 = abs(r0 p0..p3 - r2 p0..p3)
    485         pcmpgtw     xmm6,       xmm2
    486         por         xmm7,       xmm6                    ;
    487         accumulate thresholds
    488 
    489 
    490         neg         eax
    491         movq        xmm5,       QWORD PTR [esi+2*eax]   ;
    492         mm4 = r-2 p0..p7
    493         punpcklbw   xmm5,       xmm0                    ;
    494         mm5 = r-2 p0..p3
    495         paddusw     xmm3,       xmm5                    ;
    496         mm3 += mm5
    497 
    498         ;
    499         thresholding
    500         movdqa      xmm6,       xmm1                    ;
    501         mm6 = r0 p0..p3
    502         psubusw     xmm6,       xmm5                    ;
    503         mm6 = p0..p3 - r-2 p0..p3
    504         psubusw     xmm5,       xmm1                    ;
    505         mm5 = r-2 p0..p3 - p0..p3
    506         paddusw     xmm6,       xmm5                    ;
    507         mm6 = abs(r0 p0..p3 - r-2 p0..p3)
    508         pcmpgtw     xmm6,       xmm2
    509         por         xmm7,       xmm6                    ;
    510         accumulate thresholds
    511 
    512         movq        xmm4,       QWORD PTR [esi+eax]     ;
    513         mm4 = r-1 p0..p7
    514         punpcklbw   xmm4,       xmm0                    ;
    515         mm4 = r-1 p0..p3
    516         paddusw     xmm3,       xmm4                    ;
    517         mm3 += mm5
    518 
    519         ;
    520         thresholding
    521         movdqa      xmm6,       xmm1                    ;
    522         mm6 = r0 p0..p3
    523         psubusw     xmm6,       xmm4                    ;
    524         mm6 = p0..p3 - r-2 p0..p3
    525         psubusw     xmm4,       xmm1                    ;
    526         mm5 = r-1 p0..p3 - p0..p3
    527         paddusw     xmm6,       xmm4                    ;
    528         mm6 = abs(r0 p0..p3 - r-1 p0..p3)
    529         pcmpgtw     xmm6,       xmm2
    530         por         xmm7,       xmm6                    ;
    531         accumulate thresholds
    532 
    533 
    534         paddusw     xmm3,       rd42                    ;
    535         mm3 += round value
    536         psraw       xmm3,       3                       ;
    537         mm3 /= 8
    538 
    539         pand        xmm1,       xmm7                    ;
    540         mm1 select vals > thresh from source
    541         pandn       xmm7,       xmm3                    ;
    542         mm7 select vals < thresh from blurred result
    543         paddusw     xmm1,       xmm7                    ;
    544         combination
    545 
    546         packuswb    xmm1,       xmm0                    ;
    547         pack to bytes
    548         movq        QWORD PTR [edi], xmm1             ;
    549 
    550         neg         eax                   ;
    551         pitch is positive
    552         add         esi,        8
    553         add         edi,        8
    554 
    555         add         edx,        8
    556         cmp         edx,        cols
    557 
    558         jl          nextcol
    559 
    560         // done with the all cols, start the across filtering in place
    561         sub         esi,        edx
    562         sub         edi,        edx
    563 
    564         xor         edx,        edx
    565         movq        mm0,        QWORD PTR [edi-8];
    566 
    567         acrossnextcol:
    568         movq        xmm7,       QWORD PTR [edi +edx -2]
    569         movd        xmm4,       DWORD PTR [edi +edx +6]
    570 
    571         pslldq      xmm4,       8
    572         por         xmm4,       xmm7
    573 
    574         movdqa      xmm3,       xmm4
    575         psrldq      xmm3,       2
    576         punpcklbw   xmm3,       xmm0              ;
    577         mm3 = p0..p3
    578         movdqa      xmm1,       xmm3              ;
    579         mm1 = p0..p3
    580         psllw       xmm3,       2
    581 
    582 
    583         movdqa      xmm5,       xmm4
    584         psrldq      xmm5,       3
    585         punpcklbw   xmm5,       xmm0              ;
    586         mm5 = p1..p4
    587         paddusw     xmm3,       xmm5              ;
    588         mm3 += mm6
    589 
    590         ;
    591         thresholding
    592         movdqa      xmm7,       xmm1              ;
    593         mm7 = p0..p3
    594         psubusw     xmm7,       xmm5              ;
    595         mm7 = p0..p3 - p1..p4
    596         psubusw     xmm5,       xmm1              ;
    597         mm5 = p1..p4 - p0..p3
    598         paddusw     xmm7,       xmm5              ;
    599         mm7 = abs(p0..p3 - p1..p4)
    600         pcmpgtw     xmm7,       xmm2
    601 
    602         movdqa      xmm5,       xmm4
    603         psrldq      xmm5,       4
    604         punpcklbw   xmm5,       xmm0              ;
    605         mm5 = p2..p5
    606         paddusw     xmm3,       xmm5              ;
    607         mm3 += mm5
    608 
    609         ;
    610         thresholding
    611         movdqa      xmm6,       xmm1              ;
    612         mm6 = p0..p3
    613         psubusw     xmm6,       xmm5              ;
    614         mm6 = p0..p3 - p1..p4
    615         psubusw     xmm5,       xmm1              ;
    616         mm5 = p1..p4 - p0..p3
    617         paddusw     xmm6,       xmm5              ;
    618         mm6 = abs(p0..p3 - p1..p4)
    619         pcmpgtw     xmm6,       xmm2
    620         por         xmm7,       xmm6              ;
    621         accumulate thresholds
    622 
    623 
    624         movdqa      xmm5,       xmm4              ;
    625         mm5 = p-2..p5
    626         punpcklbw   xmm5,       xmm0              ;
    627         mm5 = p-2..p1
    628         paddusw     xmm3,       xmm5              ;
    629         mm3 += mm5
    630 
    631         ;
    632         thresholding
    633         movdqa      xmm6,       xmm1              ;
    634         mm6 = p0..p3
    635         psubusw     xmm6,       xmm5              ;
    636         mm6 = p0..p3 - p1..p4
    637         psubusw     xmm5,       xmm1              ;
    638         mm5 = p1..p4 - p0..p3
    639         paddusw     xmm6,       xmm5              ;
    640         mm6 = abs(p0..p3 - p1..p4)
    641         pcmpgtw     xmm6,       xmm2
    642         por         xmm7,       xmm6              ;
    643         accumulate thresholds
    644 
    645         psrldq      xmm4,       1                   ;
    646         mm4 = p-1..p5
    647         punpcklbw   xmm4,       xmm0              ;
    648         mm4 = p-1..p2
    649         paddusw     xmm3,       xmm4              ;
    650         mm3 += mm5
    651 
    652         ;
    653         thresholding
    654         movdqa      xmm6,       xmm1              ;
    655         mm6 = p0..p3
    656         psubusw     xmm6,       xmm4              ;
    657         mm6 = p0..p3 - p1..p4
    658         psubusw     xmm4,       xmm1              ;
    659         mm5 = p1..p4 - p0..p3
    660         paddusw     xmm6,       xmm4              ;
    661         mm6 = abs(p0..p3 - p1..p4)
    662         pcmpgtw     xmm6,       xmm2
    663         por         xmm7,       xmm6              ;
    664         accumulate thresholds
    665 
    666         paddusw     xmm3,       rd42              ;
    667         mm3 += round value
    668         psraw       xmm3,       3                 ;
    669         mm3 /= 8
    670 
    671         pand        xmm1,       xmm7              ;
    672         mm1 select vals > thresh from source
    673         pandn       xmm7,       xmm3              ;
    674         mm7 select vals < thresh from blurred result
    675         paddusw     xmm1,       xmm7              ;
    676         combination
    677 
    678         packuswb    xmm1,       xmm0              ;
    679         pack to bytes
    680         movq        QWORD PTR [edi+edx-8],  mm0   ;
    681         store previous four bytes
    682         movdq2q     mm0,        xmm1
    683 
    684         add         edx,        8
    685         cmp         edx,        cols
    686         jl          acrossnextcol;
    687 
    688         // last 8 pixels
    689         movq        QWORD PTR [edi+edx-8],  mm0
    690 
    691         // done with this rwo
    692         add         esi, eax               ;
    693         next line
    694         mov         eax, dst_pixels_per_line ;
    695         destination pitch?
    696         add         edi, eax               ;
    697         next destination
    698         mov         eax, src_pixels_per_line ;
    699         destination pitch?
    700 
    701         dec         ecx                   ;
    702         decrement count
    703         jnz         nextrow               ;
    704         next row
    705     }
    706 }
    707 
    708 
    709 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
    710 {
    711     int c, i;
    712     __declspec(align(16))
    713     int flimit2[2];
    714     __declspec(align(16))
    715     unsigned char d[16][8];
    716 
    717     flimit = vp8_q2mbl(flimit);
    718 
    719     for (i = 0; i < 2; i++)
    720         flimit2[i] = flimit;
    721 
    722     rows += 8;
    723 
    724     for (c = 0; c < cols; c += 4)
    725     {
    726         unsigned char *s = &dst[c];
    727 
    728         __asm
    729         {
    730             mov         esi,        s           ;
    731             pxor        mm0,        mm0     ;
    732 
    733             mov         eax,        pitch       ;
    734             neg         eax                                     // eax = -pitch
    735 
    736             lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
    737             neg         eax
    738 
    739 
    740             pxor        mm5,        mm5
    741             pxor        mm6,        mm6     ;
    742 
    743             pxor        mm7,        mm7     ;
    744             mov         edi,        esi
    745 
    746             mov         ecx,        15          ;
    747 
    748             loop_initvar:
    749             movd        mm1,        DWORD PTR [edi];
    750             punpcklbw   mm1,        mm0     ;
    751 
    752             paddw       mm5,        mm1     ;
    753             pmullw      mm1,        mm1     ;
    754 
    755             movq        mm2,        mm1     ;
    756             punpcklwd   mm1,        mm0     ;
    757 
    758             punpckhwd   mm2,        mm0     ;
    759             paddd       mm6,        mm1     ;
    760 
    761             paddd       mm7,        mm2     ;
    762             lea         edi,        [edi+eax]   ;
    763 
    764             dec         ecx
    765             jne         loop_initvar
    766             //save the var and sum
    767             xor         edx,        edx
    768             loop_row:
    769             movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
    770             movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]
    771 
    772             punpcklbw   mm1,        mm0
    773             punpcklbw   mm2,        mm0
    774 
    775             paddw       mm5,        mm2
    776             psubw       mm5,        mm1
    777 
    778             pmullw      mm2,        mm2
    779             movq        mm4,        mm2
    780 
    781             punpcklwd   mm2,        mm0
    782             punpckhwd   mm4,        mm0
    783 
    784             paddd       mm6,        mm2
    785             paddd       mm7,        mm4
    786 
    787             pmullw      mm1,        mm1
    788             movq        mm2,        mm1
    789 
    790             punpcklwd   mm1,        mm0
    791             psubd       mm6,        mm1
    792 
    793             punpckhwd   mm2,        mm0
    794             psubd       mm7,        mm2
    795 
    796 
    797             movq        mm3,        mm6
    798             pslld       mm3,        4
    799 
    800             psubd       mm3,        mm6
    801             movq        mm1,        mm5
    802 
    803             movq        mm4,        mm5
    804             pmullw      mm1,        mm1
    805 
    806             pmulhw      mm4,        mm4
    807             movq        mm2,        mm1
    808 
    809             punpcklwd   mm1,        mm4
    810             punpckhwd   mm2,        mm4
    811 
    812             movq        mm4,        mm7
    813             pslld       mm4,        4
    814 
    815             psubd       mm4,        mm7
    816 
    817             psubd       mm3,        mm1
    818             psubd       mm4,        mm2
    819 
    820             psubd       mm3,        flimit2
    821             psubd       mm4,        flimit2
    822 
    823             psrad       mm3,        31
    824             psrad       mm4,        31
    825 
    826             packssdw    mm3,        mm4
    827             packsswb    mm3,        mm0
    828 
    829             movd        mm1,        DWORD PTR [esi+eax*8]
    830 
    831             movq        mm2,        mm1
    832             punpcklbw   mm1,        mm0
    833 
    834             paddw       mm1,        mm5
    835             mov         ecx,        edx
    836 
    837             and         ecx,        127
    838             movq        mm4,        vp8_rv[ecx*2]
    839 
    840             paddw       mm1,        mm4
    841             //paddw     xmm1,       eight8s
    842             psraw       mm1,        4
    843 
    844             packuswb    mm1,        mm0
    845             pand        mm1,        mm3
    846 
    847             pandn       mm3,        mm2
    848             por         mm1,        mm3
    849 
    850             and         ecx,        15
    851             movd        DWORD PTR  d[ecx*4], mm1
    852 
    853             mov         ecx,        edx
    854             sub         ecx,        8
    855 
    856             and         ecx,        15
    857             movd        mm1,        DWORD PTR d[ecx*4]
    858 
    859             movd        [esi],      mm1
    860             lea         esi,        [esi+eax]
    861 
    862             lea         edi,        [edi+eax]
    863             add         edx,        1
    864 
    865             cmp         edx,        rows
    866             jl          loop_row
    867 
    868         }
    869 
    870     }
    871 }
    872 
    873 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
    874 {
            /*
             * In-place vertical post-processing filter (SSE2, MSVC inline asm).
             *
             * The plane is handled in strips of 8 columns.  For each strip the asm
             * keeps, per column, a running sum (xmm5, 8 words) and a running sum of
             * squares (xmm6/xmm7, 8 dwords) over a sliding 15-row window.  A pixel
             * is replaced by (pixel + sum + vp8_rv[row & 127 + lane]) >> 4 when
             * 15 * sumsq - sum * sum - flimit is negative, otherwise it is kept.
             *
             *   dst    - first pixel of the plane; the 8 rows above it are also
             *            read and written (see below)
             *   pitch  - distance in bytes between consecutive rows
             *   rows   - number of rows to filter
             *   cols   - number of columns; always consumed 8 at a time
             *   flimit - converted with vp8_q2mbl() before use as the threshold
             *
             * Filtered rows are staged in the 16-slot ring buffer d[] and written
             * back 8 rows late, so the window only ever reads unfiltered pixels.
             * The first 8 iterations therefore write slots of d[] that have not
             * been filled yet into the 8 rows above dst -- presumably border rows
             * the caller does not display; TODO confirm against the caller.
             *
             * Only XMM registers are used, so the routine leaves the MMX/x87 state
             * untouched and needs no emms on exit.
             */
    875     int c, i;
    876     __declspec(align(16))
    877     int flimit4[4];
    878     __declspec(align(16))
    879     unsigned char d[16][8];
    880 
    881     flimit = vp8_q2mbl(flimit);
    882 
    883     for (i = 0; i < 4; i++)
    884         flimit4[i] = flimit;
    885 
    886     rows += 8;                      /* 8 extra iterations flush the delayed write-back */
    887 
    888     for (c = 0; c < cols; c += 8)
    889     {
    890         unsigned char *s = &dst[c];
    891 
    892         __asm
    893         {
    894             mov         esi,        s           ;
    895             pxor        xmm0,       xmm0        ; // xmm0 stays zero: used for unpacking
    896 
    897             mov         eax,        pitch       ;
    898             neg         eax                                     // eax = -pitch
    899 
    900             lea         esi,        [esi + eax*8];              // esi = s - 8*pitch
    901             neg         eax                                     // eax = +pitch again
    902 
    903 
    904             pxor        xmm5,       xmm5
    905             pxor        xmm6,       xmm6        ;
    906 
    907             pxor        xmm7,       xmm7        ;
    908             mov         edi,        esi
    909 
    910             mov         ecx,        15          ;
    911 
                    // accumulate sum and sum of squares over the 15 rows s-8*pitch .. s+6*pitch
    912             loop_initvar:
    913             movq        xmm1,       QWORD PTR [edi];
    914             punpcklbw   xmm1,       xmm0        ;
    915 
    916             paddw       xmm5,       xmm1        ;
    917             pmullw      xmm1,       xmm1        ;
    918 
    919             movdqa      xmm2,       xmm1        ;
    920             punpcklwd   xmm1,       xmm0        ;
    921 
    922             punpckhwd   xmm2,       xmm0        ;
    923             paddd       xmm6,       xmm1        ;
    924 
    925             paddd       xmm7,       xmm2        ;
    926             lea         edi,        [edi+eax]   ;
    927 
    928             dec         ecx
    929             jne         loop_initvar
    930             // edx = row counter; edi now points at s+7*pitch
    931             xor         edx,        edx
    932             loop_row:
    933             movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]  row leaving the window
    934             movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]  row entering the window
    935 
    936             punpcklbw   xmm1,       xmm0
    937             punpcklbw   xmm2,       xmm0
    938 
    939             paddw       xmm5,       xmm2
    940             psubw       xmm5,       xmm1
    941 
    942             pmullw      xmm2,       xmm2
    943             movdqa      xmm4,       xmm2
    944 
    945             punpcklwd   xmm2,       xmm0
    946             punpckhwd   xmm4,       xmm0
    947 
    948             paddd       xmm6,       xmm2
    949             paddd       xmm7,       xmm4
    950 
    951             pmullw      xmm1,       xmm1
    952             movdqa      xmm2,       xmm1
    953 
    954             punpcklwd   xmm1,       xmm0
    955             psubd       xmm6,       xmm1
    956 
    957             punpckhwd   xmm2,       xmm0
    958             psubd       xmm7,       xmm2
    959 
    960 
    961             movdqa      xmm3,       xmm6
    962             pslld       xmm3,       4
    963 
    964             psubd       xmm3,       xmm6                // xmm3 = 15 * sumsq (columns 0-3)
    965             movdqa      xmm1,       xmm5
    966 
    967             movdqa      xmm4,       xmm5
    968             pmullw      xmm1,       xmm1                // low 16 bits of sum*sum
    969 
    970             pmulhw      xmm4,       xmm4                // high 16 bits of sum*sum
    971             movdqa      xmm2,       xmm1
    972 
    973             punpcklwd   xmm1,       xmm4                // 32-bit sum*sum, columns 0-3
    974             punpckhwd   xmm2,       xmm4                // 32-bit sum*sum, columns 4-7
    975 
    976             movdqa      xmm4,       xmm7
    977             pslld       xmm4,       4
    978 
    979             psubd       xmm4,       xmm7                // xmm4 = 15 * sumsq (columns 4-7)
    980 
    981             psubd       xmm3,       xmm1
    982             psubd       xmm4,       xmm2
    983 
    984             psubd       xmm3,       flimit4
    985             psubd       xmm4,       flimit4
    986 
    987             psrad       xmm3,       31                  // all ones where the difference is negative
    988             psrad       xmm4,       31
    989 
    990             packssdw    xmm3,       xmm4
    991             packsswb    xmm3,       xmm0                // xmm3 = per-byte select mask
    992 
    993             movq        xmm1,       QWORD PTR [esi+eax*8]   // centre row of the window
    994 
    995             movq        xmm2,       xmm1                // keep the unfiltered pixels
    996             punpcklbw   xmm1,       xmm0
    997 
    998             paddw       xmm1,       xmm5
    999             mov         ecx,        edx
   1000 
   1001             and         ecx,        127
   1002             movdqu      xmm4,       vp8_rv[ecx*2]       // 8 shorts starting at vp8_rv[row & 127]
   1003 
   1004             paddw       xmm1,       xmm4
   1005             //paddw     xmm1,       eight8s
   1006             psraw       xmm1,       4
   1007 
   1008             packuswb    xmm1,       xmm0
   1009             pand        xmm1,       xmm3
   1010 
   1011             pandn       xmm3,       xmm2
   1012             por         xmm1,       xmm3                // filtered where mask set, original elsewhere
   1013 
   1014             and         ecx,        15
   1015             movq        QWORD PTR  d[ecx*8], xmm1       // stage result in d[row & 15]
   1016 
   1017             mov         ecx,        edx
   1018             sub         ecx,        8
   1019 
   1020             and         ecx,        15
   1021             movq        xmm1,       QWORD PTR d[ecx*8]  // result staged 8 rows ago (xmm1 is free here)
   1022 
   1023             movq        QWORD PTR [esi], xmm1           // written via XMM so no MMX state is touched
   1024             lea         esi,        [esi+eax]
   1025 
   1026             lea         edi,        [edi+eax]
   1027             add         edx,        1
   1028 
   1029             cmp         edx,        rows
   1030             jl          loop_row
   1031 
   1032         }
   1033 
   1034     }
   1035 }
   1036 #if 0
   1037 /****************************************************************************
   1038  *
   1039  *  ROUTINE       : plane_add_noise_wmt
   1040  *
   1041  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
   1042  *                                  noise to
   1043  *                  unsigned int Width    width of plane
   1044  *                  unsigned int Height   height of plane
   1045  *                  int  Pitch    distance between subsequent lines of frame
   1046  *                  int  q        quantizer used to determine amount of noise
   1047  *                                  to add
   1048  *
   1049  *  OUTPUTS       : None.
   1050  *
   1051  *  RETURNS       : void.
   1052  *
   1053  *  FUNCTION      : adds gaussian noise to a plane of pixels
   1054  *
   1055  *  SPECIAL NOTES : None.
   1056  *
   1057  ****************************************************************************/
   1058 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
   1059 {
            /*
             * Disabled (#if 0) SSE2 variant that builds its own noise table on every
             * call instead of reading a precomputed one.
             *
             * Steps: fill char_dist[] with values -32..31, each repeated
             * round(256 * vp8_gaussian(sigma, 0, value)) times; draw 2048 random
             * entries of it into Rand[]; then, for every row, clamp the pixels and
             * add 16 noise bytes at a time starting at a random offset into Rand[].
             *
             * NOTE(review): char_dist has 300 entries; this assumes the rounded
             * counts never add up to more than that -- TODO confirm.
             */
   1060     unsigned int i;
   1061 
   1062     __declspec(align(16)) unsigned char blackclamp[16];
   1063     __declspec(align(16)) unsigned char whiteclamp[16];
   1064     __declspec(align(16)) unsigned char bothclamp[16];
   1065     char char_dist[300];
   1066     char Rand[2048];
   1067     double sigma;
   1068 //    return;
   1069     __asm emms                      /* reset MMX state before the x87 double math below */
   1070     sigma = a + .5 + .6 * (63 - q) / 63.0;
   1071 
   1072     // set up a lookup table of 256 entries that matches
   1073     // a gaussian distribution with sigma determined by q.
   1074     //
   1075     {
   1076         double i;                   /* shadows the outer unsigned i */
   1077         int next, j;
   1078 
   1079         next = 0;
   1080 
   1081         for (i = -32; i < 32; i++)
   1082         {
   1083             double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
   1084             int a = (int)(g + .5);  /* shadows parameter a: number of table slots for value i */
   1085 
   1086             if (a)
   1087             {
   1088                 for (j = 0; j < a; j++)
   1089                 {
   1090                     char_dist[next+j] = (char) i;
   1091                 }
   1092 
   1093                 next = next + j;
   1094             }
   1095 
   1096         }
   1097 
            /* pad the remainder of the first 256 entries with zero noise */
   1098         for (next = next; next < 256; next++)
   1099             char_dist[next] = 0;
   1100 
   1101     }
   1102 
   1103     for (i = 0; i < 2048; i++)
   1104     {
   1105         Rand[i] = char_dist[rand() & 0xff];
   1106     }
   1107 
        /* char_dist[0] is the first (lowest) value stored in the table; its negation is the clamp margin */
   1108     for (i = 0; i < 16; i++)
   1109     {
   1110         blackclamp[i] = -char_dist[0];
   1111         whiteclamp[i] = -char_dist[0];
   1112         bothclamp[i] = -2 * char_dist[0];
   1113     }
   1114 
   1115     for (i = 0; i < Height; i++)
   1116     {
   1117         unsigned char *Pos = Start + i * Pitch;
   1118         char  *Ref = Rand + (rand() & 0xff);
   1119 
   1120         __asm
   1121         {
   1122             mov ecx, [Width]
   1123             mov esi, Pos
   1124             mov edi, Ref
   1125             xor         eax, eax
   1126 
   1127             nextset:
   1128             movdqu      xmm1, [esi+eax]        // get the source
   1129 
   1130             psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
   1131             paddusb     xmm1, bothclamp
   1132             psubusb     xmm1, whiteclamp
   1133 
   1134             movdqu      xmm2, [edi+eax]        // get the noise for this line
   1135             paddb       xmm1, xmm2             // add it in
   1136             movdqu      [esi+eax], xmm1        // store the result
   1137 
   1138             add         eax, 16                // advance to the next 16 bytes of this row
   1139 
   1140             cmp         eax, ecx
   1141             jl          nextset
   1142 
   1143 
   1144         }
   1145 
   1146     }
   1147 }
   1148 #endif
   1149 __declspec(align(16))  /* 16-byte aligned: used directly as a paddd memory operand */
   1150 static const int four8s[4] = { 8, 8, 8, 8};  /* +8 per dword lane, added before the >> 4 in vp8_mbpost_proc_across_ip_xmm */
   1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
   1152 {
            /*
             * In-place horizontal post-processing filter (SSE2 + MMX, MSVC inline asm).
             *
             * For each row, C code seeds sum and sumsq over s[-8..6]; the asm then
             * slides that 15-pixel window to the right, 4 pixels per iteration.
             * Each pixel becomes (pixel + sum + 8) >> 4 when
             * 15 * sumsq - sum * sum - flimit is negative (flimit after
             * vp8_q2mbl()), otherwise it is kept.
             *
             *   src    - first pixel of the first row; bytes src[-8..-1] of every
             *            row are read and overwritten (see below)
             *   pitch  - distance in bytes between consecutive rows
             *   rows   - number of rows to process
             *   cols   - number of columns; the loop runs while ecx < cols + 8
             *
             * Results pass through a two-stage delay (xmm5 -> mm1 -> mm0) and are
             * stored 8 pixels behind the current position, so every load in the
             * loop still sees unfiltered data.  Because mm0/mm1 start at zero, the
             * first two iterations store zeros to s[-8..-1].
             *
             * NOTE(review): mm0/mm1 are used and no emms is executed here, so the
             * MMX/x87 state is left dirty on return -- presumably the caller clears
             * it; TODO confirm.
             */
   1153     int r, i;
   1154     __declspec(align(16))
   1155     int flimit4[4];
   1156     unsigned char *s = src;
   1157     int sumsq;
   1158     int sum;
   1159 
   1160 
   1161     flimit = vp8_q2mbl(flimit);
   1162     flimit4[0] =
   1163         flimit4[1] =
   1164             flimit4[2] =
   1165                 flimit4[3] = flimit;
   1166 
   1167     for (r = 0; r < rows; r++)
   1168     {
   1169 
   1170 
   1171         sumsq = 0;
   1172         sum = 0;
   1173 
            /* seed the window with the 15 pixels s[-8] .. s[6] */
   1174         for (i = -8; i <= 6; i++)
   1175         {
   1176             sumsq += s[i] * s[i];
   1177             sum   += s[i];
   1178         }
   1179 
   1180         __asm
   1181         {
   1182             mov         eax,    sumsq
   1183             movd        xmm7,   eax                     // xmm7 lane 0 = running sum of squares
   1184 
   1185             mov         eax,    sum
   1186             movd        xmm6,   eax                     // xmm6 lane 0 = running sum
   1187 
   1188             mov         esi,    s
   1189             xor         ecx,    ecx                     // ecx = current column
   1190 
   1191             mov         edx,    cols
   1192             add         edx,    8                       // 8 extra columns flush the delay line
   1193             pxor        mm0,    mm0                     // delay line starts empty (zero)
   1194             pxor        mm1,    mm1
   1195 
   1196             pxor        xmm0,   xmm0
   1197             nextcol4:
   1198 
   1199             movd        xmm1,   DWORD PTR [esi+ecx-8]   // -8 -7 -6 -5
   1200             movd        xmm2,   DWORD PTR [esi+ecx+7]   // +7 +8 +9 +10
   1201 
   1202             punpcklbw   xmm1,   xmm0                    // expanding
   1203             punpcklbw   xmm2,   xmm0                    // expanding
   1204 
   1205             punpcklwd   xmm1,   xmm0                    // expanding to dwords
   1206             punpcklwd   xmm2,   xmm0                    // expanding to dwords
   1207 
   1208             psubd       xmm2,   xmm1                    // 7--8   8--7   9--6 10--5
   1209             paddd       xmm1,   xmm1                    // -8*2   -7*2   -6*2 -5*2
   1210 
   1211             paddd       xmm1,   xmm2                    // 7+-8   8+-7   9+-6 10+-5
   1212             pmaddwd     xmm1,   xmm2                    // (in + out) * (in - out) = in^2 - out^2 per lane
   1213 
   1214             paddd       xmm6,   xmm2
   1215             paddd       xmm7,   xmm1
   1216 
   1217             pshufd      xmm6,   xmm6,   0               // duplicate the last ones
   1218             pshufd      xmm7,   xmm7,   0               // duplicate the last ones
   1219 
   1220             psrldq      xmm1,       4                   // 8--7   9--6 10--5  0000
   1221             psrldq      xmm2,       4                   // 8--7   9--6 10--5  0000
   1222 
   1223             pshufd      xmm3,   xmm1,   3               // 0000  8--7   8--7   8--7 squared
   1224             pshufd      xmm4,   xmm2,   3               // 0000  8--7   8--7   8--7 squared
   1225 
   1226             paddd       xmm6,   xmm4
   1227             paddd       xmm7,   xmm3
   1228 
   1229             pshufd      xmm3,   xmm1,   01011111b       // 0000  0000   9--6   9--6 squared
   1230             pshufd      xmm4,   xmm2,   01011111b       // 0000  0000   9--6   9--6 squared
   1231 
   1232             paddd       xmm7,   xmm3
   1233             paddd       xmm6,   xmm4
   1234 
   1235             pshufd      xmm3,   xmm1,   10111111b       // 0000  0000   0000  10--5 squared
   1236             pshufd      xmm4,   xmm2,   10111111b       // 0000  0000   0000  10--5
   1237 
   1238             paddd       xmm7,   xmm3
   1239             paddd       xmm6,   xmm4                    // xmm6/xmm7 lanes 0-3 = window sums for columns ecx..ecx+3
   1240 
   1241             movdqa      xmm3,   xmm6
   1242             pmaddwd     xmm3,   xmm3                    // sum * sum
   1243 
   1244             movdqa      xmm5,   xmm7
   1245             pslld       xmm5,   4
   1246 
   1247             psubd       xmm5,   xmm7                    // 15 * sumsq
   1248             psubd       xmm5,   xmm3
   1249 
   1250             psubd       xmm5,   flimit4
   1251             psrad       xmm5,   31                      // all ones where the difference is negative
   1252 
   1253             packssdw    xmm5,   xmm0
   1254             packsswb    xmm5,   xmm0                    // low 4 bytes = select mask
   1255 
   1256             movd        xmm1,   DWORD PTR [esi+ecx]     // the 4 pixels being filtered
   1257             movq        xmm2,   xmm1                    // keep the unfiltered copy
   1258 
   1259             punpcklbw   xmm1,   xmm0
   1260             punpcklwd   xmm1,   xmm0
   1261 
   1262             paddd       xmm1,   xmm6
   1263             paddd       xmm1,   four8s
   1264 
   1265             psrad       xmm1,   4
   1266             packssdw    xmm1,   xmm0
   1267 
   1268             packuswb    xmm1,   xmm0
   1269             pand        xmm1,   xmm5
   1270 
   1271             pandn       xmm5,   xmm2
   1272             por         xmm5,   xmm1                    // filtered where mask set, original elsewhere
   1273 
   1274             movd        [esi+ecx-8],  mm0               // store the result produced two iterations ago
   1275             movq        mm0,    mm1
   1276 
   1277             movdq2q     mm1,    xmm5                    // queue this iteration's result
   1278             psrldq      xmm7,   12                      // carry lane 3 (latest sums) into lane 0
   1279 
   1280             psrldq      xmm6,   12
   1281             add         ecx,    4
   1282 
   1283             cmp         ecx,    edx
   1284             jl          nextcol4
   1285 
   1286         }
   1287         s += pitch;
   1288     }
   1289 }
   1290 
   1291 #if 0
   1292 
   1293 /****************************************************************************
   1294  *
   1295  *  ROUTINE       : plane_add_noise_mmx
   1296  *
   1297  *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
   1298  *                                  noise to
   1299  *                  unsigned int Width    width of plane
   1300  *                  unsigned int Height   height of plane
   1301  *                  int  Pitch    distance between subsequent lines of frame
   1302  *                  int  q        quantizer used to determine amount of noise
   1303  *                                  to add
   1304  *
   1305  *  OUTPUTS       : None.
   1306  *
   1307  *  RETURNS       : void.
   1308  *
   1309  *  FUNCTION      : adds gaussian noise to a plane of pixels
   1310  *
   1311  *  SPECIAL NOTES : None.
   1312  *
   1313  ****************************************************************************/
   1314 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
   1315 {
            /*
             * Disabled (#if 0) MMX variant that builds its own noise table on every
             * call; the #else branch below is the one that is compiled.
             *
             * Steps: fill char_dist[] with values -32..31, each repeated
             * round(256 * vp8_gaussian(sigma, 0, value)) times; draw 2048 random
             * entries of it into Rand[]; then, for every row, clamp the pixels and
             * add 8 noise bytes at a time starting at a random offset into Rand[].
             *
             * Pitch4, noise_amount, noise_adder and the inner `sum` are never read.
             */
   1316     unsigned int i;
   1317     int Pitch4 = Pitch * 4;
   1318     const int noise_amount = 2;
   1319     const int noise_adder = 2 * noise_amount + 1;
   1320 
   1321     __declspec(align(16)) unsigned char blackclamp[16];
   1322     __declspec(align(16)) unsigned char whiteclamp[16];
   1323     __declspec(align(16)) unsigned char bothclamp[16];
   1324 
   1325     char char_dist[300];
   1326     char Rand[2048];
   1327 
   1328     double sigma;
   1329     __asm emms                      /* reset MMX state before the x87 double math below */
   1330     sigma = a + .5 + .6 * (63 - q) / 63.0;
   1331 
   1332     // set up a lookup table of 256 entries that matches
   1333     // a gaussian distribution with sigma determined by q.
   1334     //
   1335     {
   1336         double i, sum = 0;          /* i shadows the outer unsigned i */
   1337         int next, j;
   1338 
   1339         next = 0;
   1340 
   1341         for (i = -32; i < 32; i++)
   1342         {
   1343             int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));  /* shadows parameter a: slots for value i */
   1344 
   1345             if (a)
   1346             {
   1347                 for (j = 0; j < a; j++)
   1348                 {
   1349                     char_dist[next+j] = (char) i;
   1350                 }
   1351 
   1352                 next = next + j;
   1353             }
   1354 
   1355         }
   1356 
            /* pad the remainder of the first 256 entries with zero noise */
   1357         for (next = next; next < 256; next++)
   1358             char_dist[next] = 0;
   1359 
   1360     }
   1361 
   1362     for (i = 0; i < 2048; i++)
   1363     {
   1364         Rand[i] = char_dist[rand() & 0xff];
   1365     }
   1366 
   1367     for (i = 0; i < 16; i++)
   1368     {
   1369         blackclamp[i] = -char_dist[0];
   1370         whiteclamp[i] = -char_dist[0];
   1371         bothclamp[i] = -2 * char_dist[0];
   1372     }
   1373 
   1374     for (i = 0; i < Height; i++)
   1375     {
   1376         unsigned char *Pos = Start + i * Pitch;
   1377         char  *Ref = Rand + (rand() & 0xff);
   1378 
   1379         __asm
   1380         {
   1381             mov ecx, [Width]
   1382             mov esi, Pos
   1383             mov edi, Ref
   1384             xor         eax, eax
   1385 
   1386             nextset:
   1387             movq        mm1, [esi+eax]        // get the source
   1388 
   1389             psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
   1390             paddusb     mm1, bothclamp
   1391             psubusb     mm1, whiteclamp
   1392 
   1393             movq        mm2, [edi+eax]        // get the noise for this line
   1394             paddb       mm1, mm2             // add it in
   1395             movq        [esi+eax], mm1        // store the result
   1396 
   1397             add         eax, 8                // advance to the next 8 bytes of this row
   1398 
   1399             cmp         eax, ecx
   1400             jl          nextset
   1401 
   1402 
   1403         }
   1404 
   1405     }
   1406 }
   1407 #else
   1408 extern char an[8][64][3072];  /* noise bytes defined elsewhere; the functions below read the row an[a][q] */
   1409 extern int cd[8][64];         /* defined elsewhere; -cd[a][q] is used below as the clamp margin */
   1410 
   1411 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
   1412 {
            /*
             * Adds noise from the table an[a][q] to a plane, in place, 8 bytes at a
             * time (MMX, MSVC inline asm).
             *
             *   Start  - first pixel of the plane
             *   Width  - bytes to process per row; handled in whole groups of 8
             *   Height - number of rows
             *   Pitch  - distance in bytes between consecutive rows
             *   q, a   - indices selecting the noise row an[a][q] and margin cd[a][q]
             *
             * With k = (unsigned char)(-cd[a][q]), the three saturating operations
             * limit each pixel to [k, 255 - k] before the noise byte is added with
             * a wrapping paddb.
             *
             * The inner loop tests its bound after each group, so at least one
             * group is processed per row and up to 7 bytes past Width may be read
             * and written -- assumes the row has that much slack; TODO confirm.
             * Each row reads noise from a random offset 0..255, so Width + 255
             * (plus the overshoot) presumably has to fit in 3072 bytes.
             *
             * NOTE(review): emms is executed on entry only; mm1/mm2 leave the MMX
             * state dirty on return -- presumably cleared by the caller; confirm.
             */
   1413     unsigned int i;
   1414     __declspec(align(16)) unsigned char blackclamp[16];
   1415     __declspec(align(16)) unsigned char whiteclamp[16];
   1416     __declspec(align(16)) unsigned char bothclamp[16];
   1417 
   1418 
   1419     __asm emms
   1420 
   1421     for (i = 0; i < 16; i++)
   1422     {
   1423         blackclamp[i] = -cd[a][q];
   1424         whiteclamp[i] = -cd[a][q];
   1425         bothclamp[i] = -2 * cd[a][q];
   1426     }
   1427 
   1428     for (i = 0; i < Height; i++)
   1429     {
   1430         unsigned char *Pos = Start + i * Pitch;
   1431         char  *Ref = an[a][q] + (rand() & 0xff);  /* random starting offset into the noise row */
   1432 
   1433         __asm
   1434         {
   1435             mov ecx, [Width]
   1436             mov esi, Pos
   1437             mov edi, Ref
   1438             xor         eax, eax              // eax = byte offset within the row
   1439 
   1440             nextset:
   1441             movq        mm1, [esi+eax]        // get the source
   1442 
   1443             psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
   1444             paddusb     mm1, bothclamp
   1445             psubusb     mm1, whiteclamp
   1446 
   1447             movq        mm2, [edi+eax]        // get the noise for this line
   1448             paddb       mm1, mm2             // add it in
   1449             movq        [esi+eax], mm1        // store the result
   1450 
   1451             add         eax, 8                // advance to the next 8 bytes of this row
   1452 
   1453             cmp         eax, ecx
   1454             jl          nextset               // signed compare against the unsigned Width
   1455         }
   1456     }
   1457 }
   1458 
   1459 
   1460 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
   1461 {
            /*
             * Adds noise from the table an[a][q] to a plane, in place, 16 bytes at
             * a time (SSE2, MSVC inline asm).
             *
             *   Start  - first pixel of the plane
             *   Width  - bytes to process per row; handled in whole groups of 16
             *   Height - number of rows
             *   Pitch  - distance in bytes between consecutive rows
             *   q, a   - indices selecting the noise row an[a][q] and margin cd[a][q]
             *
             * With k = (unsigned char)(-cd[a][q]), the three saturating operations
             * limit each pixel to [k, 255 - k] before the noise byte is added with
             * a wrapping paddb.  Pixel and noise accesses use unaligned moves; the
             * clamp arrays are 16-byte aligned because they are memory operands.
             *
             * The inner loop tests its bound after each group, so at least one
             * group is processed per row and up to 15 bytes past Width may be read
             * and written -- assumes the row has that much slack; TODO confirm.
             */
   1462     unsigned int i;
   1463 
   1464     __declspec(align(16)) unsigned char blackclamp[16];
   1465     __declspec(align(16)) unsigned char whiteclamp[16];
   1466     __declspec(align(16)) unsigned char bothclamp[16];
   1467 
   1468     __asm emms
   1469 
   1470     for (i = 0; i < 16; i++)
   1471     {
   1472         blackclamp[i] = -cd[a][q];
   1473         whiteclamp[i] = -cd[a][q];
   1474         bothclamp[i] = -2 * cd[a][q];
   1475     }
   1476 
   1477     for (i = 0; i < Height; i++)
   1478     {
   1479         unsigned char *Pos = Start + i * Pitch;
   1480         char *Ref = an[a][q] + (rand() & 0xff);  /* random starting offset into the noise row */
   1481 
   1482         __asm
   1483         {
   1484             mov ecx,    [Width]
   1485             mov esi,    Pos
   1486             mov edi,    Ref
   1487             xor         eax, eax               // eax = byte offset within the row
   1488 
   1489             nextset:
   1490             movdqu      xmm1, [esi+eax]        // get the source
   1491 
   1492             psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
   1493             paddusb     xmm1, bothclamp
   1494             psubusb     xmm1, whiteclamp
   1495 
   1496             movdqu      xmm2, [edi+eax]        // get the noise for this line
   1497             paddb       xmm1, xmm2             // add it in
   1498             movdqu      [esi+eax], xmm1        // store the result
   1499 
   1500             add         eax, 16                // advance to the next 16 bytes of this row
   1501 
   1502             cmp         eax, ecx
   1503             jl          nextset                // signed compare against the unsigned Width
   1504         }
   1505     }
   1506 }
   1507 
   1508 #endif
   1509