Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void vp8_post_proc_down_and_across_xmm
     15 ;(
     16 ;    unsigned char *src_ptr,
     17 ;    unsigned char *dst_ptr,
     18 ;    int src_pixels_per_line,
     19 ;    int dst_pixels_per_line,
     20 ;    int rows,
     21 ;    int cols,
     22 ;    int flimit
     23 ;)
     24 global sym(vp8_post_proc_down_and_across_xmm)
     25 sym(vp8_post_proc_down_and_across_xmm):
     26     push        rbp
     27     mov         rbp, rsp
     28     SHADOW_ARGS_TO_STACK 7
     29     SAVE_XMM
     30     GET_GOT     rbx
     31     push        rsi
     32     push        rdi
     33     ; end prolog
     34 
     35 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
     36     ALIGN_STACK 16, rax
     37     ; move the global rd onto the stack, since we don't have enough registers
     38     ; to do PIC addressing
     39     movdqa      xmm0, [GLOBAL(rd42)]
     40     sub         rsp, 16
     41     movdqa      [rsp], xmm0
     42 %define RD42 [rsp]
     43 %else
     44 %define RD42 [GLOBAL(rd42)]
     45 %endif
     46 
     47 
     48         movd        xmm2,       dword ptr arg(6) ;flimit
     49         punpcklwd   xmm2,       xmm2
     50         punpckldq   xmm2,       xmm2
     51         punpcklqdq  xmm2,       xmm2
     52 
     53         mov         rsi,        arg(0) ;src_ptr
     54         mov         rdi,        arg(1) ;dst_ptr
     55 
     56         movsxd      rcx,        DWORD PTR arg(4) ;rows
     57         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
     58         pxor        xmm0,       xmm0              ; mm0 = 00000000
     59 
     60 nextrow:
     61 
     62         xor         rdx,        rdx       ; clear out rdx for use as loop counter
     63 nextcol:
     64         movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
     65         punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
     66         movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
     67         psllw       xmm3,       2                       ;
     68 
     69         movq        xmm5,       QWORD PTR [rsi + rax]   ; mm4 = r1 p0..p7
     70         punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
     71         paddusw     xmm3,       xmm5                    ; mm3 += mm6
     72 
     73         ; thresholding
     74         movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
     75         psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
     76         psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
     77         paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
     78         pcmpgtw     xmm7,       xmm2
     79 
     80         movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
     81         punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
     82         paddusw     xmm3,       xmm5                    ; mm3 += mm5
     83 
     84         ; thresholding
     85         movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
     86         psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
     87         psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r2 p0..p3
     88         paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
     89         pcmpgtw     xmm6,       xmm2
     90         por         xmm7,       xmm6                    ; accumulate thresholds
     91 
     92 
     93         neg         rax
     94         movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm4 = r-2 p0..p7
     95         punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
     96         paddusw     xmm3,       xmm5                    ; mm3 += mm5
     97 
     98         ; thresholding
     99         movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
    100         psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
    101         psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
    102         paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
    103         pcmpgtw     xmm6,       xmm2
    104         por         xmm7,       xmm6                    ; accumulate thresholds
    105 
    106         movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
    107         punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
    108         paddusw     xmm3,       xmm4                    ; mm3 += mm5
    109 
    110         ; thresholding
    111         movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
    112         psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-2 p0..p3
    113         psubusw     xmm4,       xmm1                    ; mm5 = r-1 p0..p3 - p0..p3
    114         paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
    115         pcmpgtw     xmm6,       xmm2
    116         por         xmm7,       xmm6                    ; accumulate thresholds
    117 
    118 
    119         paddusw     xmm3,       RD42                    ; mm3 += round value
    120         psraw       xmm3,       3                       ; mm3 /= 8
    121 
    122         pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
    123         pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
    124         paddusw     xmm1,       xmm7                    ; combination
    125 
    126         packuswb    xmm1,       xmm0                    ; pack to bytes
    127         movq        QWORD PTR [rdi], xmm1             ;
    128 
    129         neg         rax                   ; pitch is positive
    130         add         rsi,        8
    131         add         rdi,        8
    132 
    133         add         rdx,        8
    134         cmp         edx,        dword arg(5) ;cols
    135 
    136         jl          nextcol
    137 
    138         ; done with the all cols, start the across filtering in place
    139         sub         rsi,        rdx
    140         sub         rdi,        rdx
    141 
    142         xor         rdx,        rdx
    143         movq        mm0,        QWORD PTR [rdi-8];
    144 
    145 acrossnextcol:
    146         movq        xmm7,       QWORD PTR [rdi +rdx -2]
    147         movd        xmm4,       DWORD PTR [rdi +rdx +6]
    148 
    149         pslldq      xmm4,       8
    150         por         xmm4,       xmm7
    151 
    152         movdqa      xmm3,       xmm4
    153         psrldq      xmm3,       2
    154         punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
    155         movdqa      xmm1,       xmm3              ; mm1 = p0..p3
    156         psllw       xmm3,       2
    157 
    158 
    159         movdqa      xmm5,       xmm4
    160         psrldq      xmm5,       3
    161         punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
    162         paddusw     xmm3,       xmm5              ; mm3 += mm6
    163 
    164         ; thresholding
    165         movdqa      xmm7,       xmm1              ; mm7 = p0..p3
    166         psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
    167         psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
    168         paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
    169         pcmpgtw     xmm7,       xmm2
    170 
    171         movdqa      xmm5,       xmm4
    172         psrldq      xmm5,       4
    173         punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
    174         paddusw     xmm3,       xmm5              ; mm3 += mm5
    175 
    176         ; thresholding
    177         movdqa      xmm6,       xmm1              ; mm6 = p0..p3
    178         psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
    179         psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
    180         paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
    181         pcmpgtw     xmm6,       xmm2
    182         por         xmm7,       xmm6              ; accumulate thresholds
    183 
    184 
    185         movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
    186         punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
    187         paddusw     xmm3,       xmm5              ; mm3 += mm5
    188 
    189         ; thresholding
    190         movdqa      xmm6,       xmm1              ; mm6 = p0..p3
    191         psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
    192         psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
    193         paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
    194         pcmpgtw     xmm6,       xmm2
    195         por         xmm7,       xmm6              ; accumulate thresholds
    196 
    197         psrldq      xmm4,       1                   ; mm4 = p-1..p5
    198         punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
    199         paddusw     xmm3,       xmm4              ; mm3 += mm5
    200 
    201         ; thresholding
    202         movdqa      xmm6,       xmm1              ; mm6 = p0..p3
    203         psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p1..p4
    204         psubusw     xmm4,       xmm1              ; mm5 = p1..p4 - p0..p3
    205         paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p1..p4)
    206         pcmpgtw     xmm6,       xmm2
    207         por         xmm7,       xmm6              ; accumulate thresholds
    208 
    209         paddusw     xmm3,       RD42              ; mm3 += round value
    210         psraw       xmm3,       3                 ; mm3 /= 8
    211 
    212         pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
    213         pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
    214         paddusw     xmm1,       xmm7              ; combination
    215 
    216         packuswb    xmm1,       xmm0              ; pack to bytes
    217         movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous four bytes
    218         movdq2q     mm0,        xmm1
    219 
    220         add         rdx,        8
    221         cmp         edx,        dword arg(5) ;cols
    222         jl          acrossnextcol;
    223 
    224         ; last 8 pixels
    225         movq        QWORD PTR [rdi+rdx-8],  mm0
    226 
    227         ; done with this rwo
    228         add         rsi,rax               ; next line
    229         mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
    230         add         rdi,rax               ; next destination
    231         mov         eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
    232 
    233         dec         rcx                   ; decrement count
    234         jnz         nextrow               ; next row
    235 
    236 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    237     add rsp,16
    238     pop rsp
    239 %endif
    240     ; begin epilog
    241     pop rdi
    242     pop rsi
    243     RESTORE_GOT
    244     RESTORE_XMM
    245     UNSHADOW_ARGS
    246     pop         rbp
    247     ret
    248 %undef RD42
    249 
    250 
    251 ;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
    252 ;                            int pitch, int rows, int cols,int flimit)
    253 extern sym(vp8_rv)
    254 global sym(vp8_mbpost_proc_down_xmm)
    255 sym(vp8_mbpost_proc_down_xmm):
    256     push        rbp
    257     mov         rbp, rsp
    258     SHADOW_ARGS_TO_STACK 5
    259     SAVE_XMM
    260     GET_GOT     rbx
    261     push        rsi
    262     push        rdi
    263     ; end prolog
    264 
    265     ALIGN_STACK 16, rax
    266     sub         rsp, 128+16
    267 
    268     ; unsigned char d[16][8] at [rsp]
    269     ; create flimit2 at [rsp+128]
    270     mov         eax, dword ptr arg(4) ;flimit
    271     mov         [rsp+128], eax
    272     mov         [rsp+128+4], eax
    273     mov         [rsp+128+8], eax
    274     mov         [rsp+128+12], eax
    275 %define flimit4 [rsp+128]
    276 
    277 %if ABI_IS_32BIT=0
    278     lea         r8,       [GLOBAL(sym(vp8_rv))]
    279 %endif
    280 
    281     ;rows +=8;
    282     add         dword arg(2), 8
    283 
    284     ;for(c=0; c<cols; c+=8)
    285 loop_col:
    286             mov         rsi,        arg(0) ; s
    287             pxor        xmm0,       xmm0        ;
    288 
    289             movsxd      rax,        dword ptr arg(1) ;pitch       ;
    290             neg         rax                                     ; rax = -pitch
    291 
    292             lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
    293             neg         rax
    294 
    295 
    296             pxor        xmm5,       xmm5
    297             pxor        xmm6,       xmm6        ;
    298 
    299             pxor        xmm7,       xmm7        ;
    300             mov         rdi,        rsi
    301 
    302             mov         rcx,        15          ;
    303 
    304 loop_initvar:
    305             movq        xmm1,       QWORD PTR [rdi];
    306             punpcklbw   xmm1,       xmm0        ;
    307 
    308             paddw       xmm5,       xmm1        ;
    309             pmullw      xmm1,       xmm1        ;
    310 
    311             movdqa      xmm2,       xmm1        ;
    312             punpcklwd   xmm1,       xmm0        ;
    313 
    314             punpckhwd   xmm2,       xmm0        ;
    315             paddd       xmm6,       xmm1        ;
    316 
    317             paddd       xmm7,       xmm2        ;
    318             lea         rdi,        [rdi+rax]   ;
    319 
    320             dec         rcx
    321             jne         loop_initvar
    322             ;save the var and sum
    323             xor         rdx,        rdx
    324 loop_row:
    325             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
    326             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
    327 
    328             punpcklbw   xmm1,       xmm0
    329             punpcklbw   xmm2,       xmm0
    330 
    331             paddw       xmm5,       xmm2
    332             psubw       xmm5,       xmm1
    333 
    334             pmullw      xmm2,       xmm2
    335             movdqa      xmm4,       xmm2
    336 
    337             punpcklwd   xmm2,       xmm0
    338             punpckhwd   xmm4,       xmm0
    339 
    340             paddd       xmm6,       xmm2
    341             paddd       xmm7,       xmm4
    342 
    343             pmullw      xmm1,       xmm1
    344             movdqa      xmm2,       xmm1
    345 
    346             punpcklwd   xmm1,       xmm0
    347             psubd       xmm6,       xmm1
    348 
    349             punpckhwd   xmm2,       xmm0
    350             psubd       xmm7,       xmm2
    351 
    352 
    353             movdqa      xmm3,       xmm6
    354             pslld       xmm3,       4
    355 
    356             psubd       xmm3,       xmm6
    357             movdqa      xmm1,       xmm5
    358 
    359             movdqa      xmm4,       xmm5
    360             pmullw      xmm1,       xmm1
    361 
    362             pmulhw      xmm4,       xmm4
    363             movdqa      xmm2,       xmm1
    364 
    365             punpcklwd   xmm1,       xmm4
    366             punpckhwd   xmm2,       xmm4
    367 
    368             movdqa      xmm4,       xmm7
    369             pslld       xmm4,       4
    370 
    371             psubd       xmm4,       xmm7
    372 
    373             psubd       xmm3,       xmm1
    374             psubd       xmm4,       xmm2
    375 
    376             psubd       xmm3,       flimit4
    377             psubd       xmm4,       flimit4
    378 
    379             psrad       xmm3,       31
    380             psrad       xmm4,       31
    381 
    382             packssdw    xmm3,       xmm4
    383             packsswb    xmm3,       xmm0
    384 
    385             movq        xmm1,       QWORD PTR [rsi+rax*8]
    386 
    387             movq        xmm2,       xmm1
    388             punpcklbw   xmm1,       xmm0
    389 
    390             paddw       xmm1,       xmm5
    391             mov         rcx,        rdx
    392 
    393             and         rcx,        127
    394 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    395             push        rax
    396             lea         rax,        [GLOBAL(sym(vp8_rv))]
    397             movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
    398             pop         rax
    399 %elif ABI_IS_32BIT=0
    400             movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
    401 %else
    402             movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
    403 %endif
    404 
    405             paddw       xmm1,       xmm4
    406             ;paddw     xmm1,       eight8s
    407             psraw       xmm1,       4
    408 
    409             packuswb    xmm1,       xmm0
    410             pand        xmm1,       xmm3
    411 
    412             pandn       xmm3,       xmm2
    413             por         xmm1,       xmm3
    414 
    415             and         rcx,        15
    416             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
    417 
    418             mov         rcx,        rdx
    419             sub         rcx,        8
    420 
    421             and         rcx,        15
    422             movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
    423 
    424             movq        [rsi],      mm0
    425             lea         rsi,        [rsi+rax]
    426 
    427             lea         rdi,        [rdi+rax]
    428             add         rdx,        1
    429 
    430             cmp         edx,        dword arg(2) ;rows
    431             jl          loop_row
    432 
    433         add         dword arg(0), 8 ; s += 8
    434         sub         dword arg(3), 8 ; cols -= 8
    435         cmp         dword arg(3), 0
    436         jg          loop_col
    437 
    438     add         rsp, 128+16
    439     pop         rsp
    440 
    441     ; begin epilog
    442     pop rdi
    443     pop rsi
    444     RESTORE_GOT
    445     RESTORE_XMM
    446     UNSHADOW_ARGS
    447     pop         rbp
    448     ret
    449 %undef flimit4
    450 
    451 
    452 ;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
    453 ;                                int pitch, int rows, int cols,int flimit)
    454 global sym(vp8_mbpost_proc_across_ip_xmm)
    455 sym(vp8_mbpost_proc_across_ip_xmm):
    456     push        rbp
    457     mov         rbp, rsp
    458     SHADOW_ARGS_TO_STACK 5
    459     SAVE_XMM
    460     GET_GOT     rbx
    461     push        rsi
    462     push        rdi
    463     ; end prolog
    464 
    465     ALIGN_STACK 16, rax
    466     sub         rsp, 16
    467 
    468     ; create flimit4 at [rsp]
    469     mov         eax, dword ptr arg(4) ;flimit
    470     mov         [rsp], eax
    471     mov         [rsp+4], eax
    472     mov         [rsp+8], eax
    473     mov         [rsp+12], eax
    474 %define flimit4 [rsp]
    475 
    476 
    477     ;for(r=0;r<rows;r++)
    478 ip_row_loop:
    479 
    480         xor         rdx,    rdx ;sumsq=0;
    481         xor         rcx,    rcx ;sum=0;
    482         mov         rsi,    arg(0); s
    483         mov         rdi,    -8
    484 ip_var_loop:
    485         ;for(i=-8;i<=6;i++)
    486         ;{
    487         ;    sumsq += s[i]*s[i];
    488         ;    sum   += s[i];
    489         ;}
    490         movzx       eax, byte [rsi+rdi]
    491         add         ecx, eax
    492         mul         al
    493         add         edx, eax
    494         add         rdi, 1
    495         cmp         rdi, 6
    496         jle         ip_var_loop
    497 
    498 
    499             ;mov         rax,    sumsq
    500             ;movd        xmm7,   rax
    501             movd        xmm7,   edx
    502 
    503             ;mov         rax,    sum
    504             ;movd        xmm6,   rax
    505             movd        xmm6,   ecx
    506 
    507             mov         rsi,    arg(0) ;s
    508             xor         rcx,    rcx
    509 
    510             movsxd      rdx,    dword arg(3) ;cols
    511             add         rdx,    8
    512             pxor        mm0,    mm0
    513             pxor        mm1,    mm1
    514 
    515             pxor        xmm0,   xmm0
    516 nextcol4:
    517 
    518             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
    519             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
    520 
    521             punpcklbw   xmm1,   xmm0                    ; expanding
    522             punpcklbw   xmm2,   xmm0                    ; expanding
    523 
    524             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
    525             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
    526 
    527             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
    528             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
    529 
    530             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
    531             pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5
    532 
    533             paddd       xmm6,   xmm2
    534             paddd       xmm7,   xmm1
    535 
    536             pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
    537             pshufd      xmm7,   xmm7,   0               ; duplicate the last ones
    538 
    539             psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
    540             psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
    541 
    542             pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
    543             pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared
    544 
    545             paddd       xmm6,   xmm4
    546             paddd       xmm7,   xmm3
    547 
    548             pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
    549             pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared
    550 
    551             paddd       xmm7,   xmm3
    552             paddd       xmm6,   xmm4
    553 
    554             pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
    555             pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared
    556 
    557             paddd       xmm7,   xmm3
    558             paddd       xmm6,   xmm4
    559 
    560             movdqa      xmm3,   xmm6
    561             pmaddwd     xmm3,   xmm3
    562 
    563             movdqa      xmm5,   xmm7
    564             pslld       xmm5,   4
    565 
    566             psubd       xmm5,   xmm7
    567             psubd       xmm5,   xmm3
    568 
    569             psubd       xmm5,   flimit4
    570             psrad       xmm5,   31
    571 
    572             packssdw    xmm5,   xmm0
    573             packsswb    xmm5,   xmm0
    574 
    575             movd        xmm1,   DWORD PTR [rsi+rcx]
    576             movq        xmm2,   xmm1
    577 
    578             punpcklbw   xmm1,   xmm0
    579             punpcklwd   xmm1,   xmm0
    580 
    581             paddd       xmm1,   xmm6
    582             paddd       xmm1,   [GLOBAL(four8s)]
    583 
    584             psrad       xmm1,   4
    585             packssdw    xmm1,   xmm0
    586 
    587             packuswb    xmm1,   xmm0
    588             pand        xmm1,   xmm5
    589 
    590             pandn       xmm5,   xmm2
    591             por         xmm5,   xmm1
    592 
    593             movd        [rsi+rcx-8],  mm0
    594             movq        mm0,    mm1
    595 
    596             movdq2q     mm1,    xmm5
    597             psrldq      xmm7,   12
    598 
    599             psrldq      xmm6,   12
    600             add         rcx,    4
    601 
    602             cmp         rcx,    rdx
    603             jl          nextcol4
    604 
    605         ;s+=pitch;
    606         movsxd rax, dword arg(1)
    607         add    arg(0), rax
    608 
    609         sub dword arg(2), 1 ;rows-=1
    610         cmp dword arg(2), 0
    611         jg ip_row_loop
    612 
    613     add         rsp, 16
    614     pop         rsp
    615 
    616     ; begin epilog
    617     pop rdi
    618     pop rsi
    619     RESTORE_GOT
    620     RESTORE_XMM
    621     UNSHADOW_ARGS
    622     pop         rbp
    623     ret
    624 %undef flimit4
    625 
    626 
    627 ;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
    628 ;                            unsigned char blackclamp[16],
    629 ;                            unsigned char whiteclamp[16],
    630 ;                            unsigned char bothclamp[16],
    631 ;                            unsigned int Width, unsigned int Height, int Pitch)
    632 extern sym(rand)
    633 global sym(vp8_plane_add_noise_wmt)
    634 sym(vp8_plane_add_noise_wmt):
    635     push        rbp
    636     mov         rbp, rsp
    637     SHADOW_ARGS_TO_STACK 8
    638     GET_GOT     rbx
    639     push        rsi
    640     push        rdi
    641     ; end prolog
    642 
    643 addnoise_loop:
    644     call sym(rand) WRT_PLT
    645     mov     rcx, arg(1) ;noise
    646     and     rax, 0xff
    647     add     rcx, rax
    648 
    649     ; we rely on the fact that the clamping vectors are stored contiguously
    650     ; in black/white/both order. Note that we have to reload this here because
    651     ; rdx could be trashed by rand()
    652     mov     rdx, arg(2) ; blackclamp
    653 
    654 
    655             mov     rdi, rcx
    656             movsxd  rcx, dword arg(5) ;[Width]
    657             mov     rsi, arg(0) ;Pos
    658             xor         rax,rax
    659 
    660 addnoise_nextset:
    661             movdqu      xmm1,[rsi+rax]         ; get the source
    662 
    663             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
    664             paddusb     xmm1, [rdx+32] ;bothclamp
    665             psubusb     xmm1, [rdx+16] ;whiteclamp
    666 
    667             movdqu      xmm2,[rdi+rax]         ; get the noise for this line
    668             paddb       xmm1,xmm2              ; add it in
    669             movdqu      [rsi+rax],xmm1         ; store the result
    670 
    671             add         rax,16                 ; move to the next line
    672 
    673             cmp         rax, rcx
    674             jl          addnoise_nextset
    675 
    676     movsxd  rax, dword arg(7) ; Pitch
    677     add     arg(0), rax ; Start += Pitch
    678     sub     dword arg(6), 1   ; Height -= 1
    679     jg      addnoise_loop
    680 
    681     ; begin epilog
    682     pop rdi
    683     pop rsi
    684     RESTORE_GOT
    685     UNSHADOW_ARGS
    686     pop         rbp
    687     ret
    688 
    689 
    690 SECTION_RODATA
    691 align 16
    692 rd42:
    693     times 8 dw 0x04
    694 four8s:
    695     times 4 dd 8
    696