Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;void copy_mem16x16_sse2(
     15 ;    unsigned char *src,
     16 ;    int src_stride,
     17 ;    unsigned char *dst,
     18 ;    int dst_stride
     19 ;    )
     20 global sym(vp8_copy_mem16x16_sse2) PRIVATE
     21 sym(vp8_copy_mem16x16_sse2):
     22     push        rbp
     23     mov         rbp, rsp
     24     SHADOW_ARGS_TO_STACK 4
     25     push        rsi
     26     push        rdi
     27     ; end prolog
     28 
     29         mov         rsi,        arg(0) ;src;
     30         movdqu      xmm0,       [rsi]
     31 
     32         movsxd      rax,        dword ptr arg(1) ;src_stride;
     33         mov         rdi,        arg(2) ;dst;
     34 
     35         movdqu      xmm1,       [rsi+rax]
     36         movdqu      xmm2,       [rsi+rax*2]
     37 
     38         movsxd      rcx,        dword ptr arg(3) ;dst_stride
     39         lea         rsi,        [rsi+rax*2]
     40 
     41         movdqa      [rdi],      xmm0
     42         add         rsi,        rax
     43 
     44         movdqa      [rdi+rcx],  xmm1
     45         movdqa      [rdi+rcx*2],xmm2
     46 
     47         lea         rdi,        [rdi+rcx*2]
     48         movdqu      xmm3,       [rsi]
     49 
     50         add         rdi,        rcx
     51         movdqu      xmm4,       [rsi+rax]
     52 
     53         movdqu      xmm5,       [rsi+rax*2]
     54         lea         rsi,        [rsi+rax*2]
     55 
     56         movdqa      [rdi],  xmm3
     57         add         rsi,        rax
     58 
     59         movdqa      [rdi+rcx],  xmm4
     60         movdqa      [rdi+rcx*2],xmm5
     61 
     62         lea         rdi,        [rdi+rcx*2]
     63         movdqu      xmm0,       [rsi]
     64 
     65         add         rdi,        rcx
     66         movdqu      xmm1,       [rsi+rax]
     67 
     68         movdqu      xmm2,       [rsi+rax*2]
     69         lea         rsi,        [rsi+rax*2]
     70 
     71         movdqa      [rdi],      xmm0
     72         add         rsi,        rax
     73 
     74         movdqa      [rdi+rcx],  xmm1
     75 
     76         movdqa      [rdi+rcx*2],    xmm2
     77         movdqu      xmm3,       [rsi]
     78 
     79         movdqu      xmm4,       [rsi+rax]
     80         lea         rdi,        [rdi+rcx*2]
     81 
     82         add         rdi,        rcx
     83         movdqu      xmm5,       [rsi+rax*2]
     84 
     85         lea         rsi,        [rsi+rax*2]
     86         movdqa      [rdi],  xmm3
     87 
     88         add         rsi,        rax
     89         movdqa      [rdi+rcx],  xmm4
     90 
     91         movdqa      [rdi+rcx*2],xmm5
     92         movdqu      xmm0,       [rsi]
     93 
     94         lea         rdi,        [rdi+rcx*2]
     95         movdqu      xmm1,       [rsi+rax]
     96 
     97         add         rdi,        rcx
     98         movdqu      xmm2,       [rsi+rax*2]
     99 
    100         lea         rsi,        [rsi+rax*2]
    101         movdqa      [rdi],      xmm0
    102 
    103         movdqa      [rdi+rcx],  xmm1
    104         movdqa      [rdi+rcx*2],xmm2
    105 
    106         movdqu      xmm3,       [rsi+rax]
    107         lea         rdi,        [rdi+rcx*2]
    108 
    109         movdqa      [rdi+rcx],  xmm3
    110 
    111     ; begin epilog
    112     pop rdi
    113     pop rsi
    114     UNSHADOW_ARGS
    115     pop         rbp
    116     ret
    117 
    118 
    119 ;void vp8_intra_pred_uv_dc_mmx2(
    120 ;    unsigned char *dst,
    121 ;    int dst_stride
    122 ;    unsigned char *above,
    123 ;    unsigned char *left,
    124 ;    int left_stride,
    125 ;    )
    126 global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
    127 sym(vp8_intra_pred_uv_dc_mmx2):
    128     push        rbp
    129     mov         rbp, rsp
    130     SHADOW_ARGS_TO_STACK 5
    131     push        rsi
    132     push        rdi
    133     ; end prolog
    134 
    135     ; from top
    136     mov         rdi,        arg(2) ;above;
    137     mov         rsi,        arg(3) ;left;
    138     movsxd      rax,        dword ptr arg(4) ;left_stride;
    139     pxor        mm0,        mm0
    140     movq        mm1,        [rdi]
    141     lea         rdi,        [rax*3]
    142     psadbw      mm1,        mm0
    143     ; from left
    144     movzx       ecx,        byte [rsi]
    145     movzx       edx,        byte [rsi+rax*1]
    146     add         ecx,        edx
    147     movzx       edx,        byte [rsi+rax*2]
    148     add         ecx,        edx
    149 
    150     movzx       edx,        byte [rsi+rdi]
    151     lea         rsi,        [rsi+rax*4]
    152     add         ecx,        edx
    153     movzx       edx,        byte [rsi]
    154     add         ecx,        edx
    155     movzx       edx,        byte [rsi+rax]
    156     add         ecx,        edx
    157     movzx       edx,        byte [rsi+rax*2]
    158     add         ecx,        edx
    159     movzx       edx,        byte [rsi+rdi]
    160     add         ecx,        edx
    161 
    162     ; add up
    163     pextrw      edx,        mm1, 0x0
    164     lea         edx,        [edx+ecx+8]
    165     sar         edx,        4
    166     movd        mm1,        edx
    167     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    168     pshufw      mm1,        mm1, 0x0
    169     mov         rdi,        arg(0) ;dst;
    170     packuswb    mm1,        mm1
    171 
    172     ; write out
    173     lea         rax,        [rcx*3]
    174     lea         rdx,        [rdi+rcx*4]
    175 
    176     movq [rdi      ],       mm1
    177     movq [rdi+rcx  ],       mm1
    178     movq [rdi+rcx*2],       mm1
    179     movq [rdi+rax  ],       mm1
    180     movq [rdx      ],       mm1
    181     movq [rdx+rcx  ],       mm1
    182     movq [rdx+rcx*2],       mm1
    183     movq [rdx+rax  ],       mm1
    184 
    185     ; begin epilog
    186     pop         rdi
    187     pop         rsi
    188     UNSHADOW_ARGS
    189     pop         rbp
    190     ret
    191 
    192 ;void vp8_intra_pred_uv_dctop_mmx2(
    193 ;    unsigned char *dst,
    194 ;    int dst_stride
    195 ;    unsigned char *above,
    196 ;    unsigned char *left,
    197 ;    int left_stride,
    198 ;    )
    199 global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
    200 sym(vp8_intra_pred_uv_dctop_mmx2):
    201     push        rbp
    202     mov         rbp, rsp
    203     SHADOW_ARGS_TO_STACK 5
    204     GET_GOT     rbx
    205     push        rsi
    206     push        rdi
    207     ; end prolog
    208 
    209     ;arg(3), arg(4) not used
    210 
    211     ; from top
    212     mov         rsi,        arg(2) ;above;
    213     pxor        mm0,        mm0
    214     movq        mm1,        [rsi]
    215     psadbw      mm1,        mm0
    216 
    217     ; add up
    218     paddw       mm1,        [GLOBAL(dc_4)]
    219     psraw       mm1,        3
    220     pshufw      mm1,        mm1, 0x0
    221     packuswb    mm1,        mm1
    222 
    223     ; write out
    224     mov         rdi,        arg(0) ;dst;
    225     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    226     lea         rax,        [rcx*3]
    227 
    228     movq [rdi      ],       mm1
    229     movq [rdi+rcx  ],       mm1
    230     movq [rdi+rcx*2],       mm1
    231     movq [rdi+rax  ],       mm1
    232     lea         rdi,        [rdi+rcx*4]
    233     movq [rdi      ],       mm1
    234     movq [rdi+rcx  ],       mm1
    235     movq [rdi+rcx*2],       mm1
    236     movq [rdi+rax  ],       mm1
    237 
    238     ; begin epilog
    239     pop         rdi
    240     pop         rsi
    241     RESTORE_GOT
    242     UNSHADOW_ARGS
    243     pop         rbp
    244     ret
    245 
    246 ;void vp8_intra_pred_uv_dcleft_mmx2(
    247 ;    unsigned char *dst,
    248 ;    int dst_stride
    249 ;    unsigned char *above,
    250 ;    unsigned char *left,
    251 ;    int left_stride,
    252 ;    )
    253 global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
    254 sym(vp8_intra_pred_uv_dcleft_mmx2):
    255     push        rbp
    256     mov         rbp, rsp
    257     SHADOW_ARGS_TO_STACK 5
    258     push        rsi
    259     push        rdi
    260     ; end prolog
    261 
    262     ;arg(2) not used
    263 
    264     ; from left
    265     mov         rsi,        arg(3) ;left;
    266     movsxd      rax,        dword ptr arg(4) ;left_stride;
    267     lea         rdi,        [rax*3]
    268     movzx       ecx,        byte [rsi]
    269     movzx       edx,        byte [rsi+rax]
    270     add         ecx,        edx
    271     movzx       edx,        byte [rsi+rax*2]
    272     add         ecx,        edx
    273     movzx       edx,        byte [rsi+rdi]
    274     add         ecx,        edx
    275     lea         rsi,        [rsi+rax*4]
    276     movzx       edx,        byte [rsi]
    277     add         ecx,        edx
    278     movzx       edx,        byte [rsi+rax]
    279     add         ecx,        edx
    280     movzx       edx,        byte [rsi+rax*2]
    281     add         ecx,        edx
    282     movzx       edx,        byte [rsi+rdi]
    283     lea         edx,        [ecx+edx+4]
    284 
    285     ; add up
    286     shr         edx,        3
    287     movd        mm1,        edx
    288     pshufw      mm1,        mm1, 0x0
    289     packuswb    mm1,        mm1
    290 
    291     ; write out
    292     mov         rdi,        arg(0) ;dst;
    293     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    294     lea         rax,        [rcx*3]
    295 
    296     movq [rdi      ],       mm1
    297     movq [rdi+rcx  ],       mm1
    298     movq [rdi+rcx*2],       mm1
    299     movq [rdi+rax  ],       mm1
    300     lea         rdi,        [rdi+rcx*4]
    301     movq [rdi      ],       mm1
    302     movq [rdi+rcx  ],       mm1
    303     movq [rdi+rcx*2],       mm1
    304     movq [rdi+rax  ],       mm1
    305 
    306     ; begin epilog
    307     pop         rdi
    308     pop         rsi
    309     UNSHADOW_ARGS
    310     pop         rbp
    311     ret
    312 
    313 ;void vp8_intra_pred_uv_dc128_mmx(
    314 ;    unsigned char *dst,
    315 ;    int dst_stride
    316 ;    unsigned char *above,
    317 ;    unsigned char *left,
    318 ;    int left_stride,
    319 ;    )
    320 global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
    321 sym(vp8_intra_pred_uv_dc128_mmx):
    322     push        rbp
    323     mov         rbp, rsp
    324     SHADOW_ARGS_TO_STACK 5
    325     GET_GOT     rbx
    326     ; end prolog
    327 
    328     ;arg(2), arg(3), arg(4) not used
    329 
    330     ; write out
    331     movq        mm1,        [GLOBAL(dc_128)]
    332     mov         rax,        arg(0) ;dst;
    333     movsxd      rdx,        dword ptr arg(1) ;dst_stride
    334     lea         rcx,        [rdx*3]
    335 
    336     movq [rax      ],       mm1
    337     movq [rax+rdx  ],       mm1
    338     movq [rax+rdx*2],       mm1
    339     movq [rax+rcx  ],       mm1
    340     lea         rax,        [rax+rdx*4]
    341     movq [rax      ],       mm1
    342     movq [rax+rdx  ],       mm1
    343     movq [rax+rdx*2],       mm1
    344     movq [rax+rcx  ],       mm1
    345 
    346     ; begin epilog
    347     RESTORE_GOT
    348     UNSHADOW_ARGS
    349     pop         rbp
    350     ret
    351 
    352 ;void vp8_intra_pred_uv_tm_sse2(
    353 ;    unsigned char *dst,
    354 ;    int dst_stride
    355 ;    unsigned char *above,
    356 ;    unsigned char *left,
    357 ;    int left_stride,
    358 ;    )
    359 %macro vp8_intra_pred_uv_tm 1
    360 global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
    361 sym(vp8_intra_pred_uv_tm_%1):
    362     push        rbp
    363     mov         rbp, rsp
    364     SHADOW_ARGS_TO_STACK 5
    365     GET_GOT     rbx
    366     push        rsi
    367     push        rdi
    368     ; end prolog
    369 
    370     ; read top row
    371     mov         edx,        4
    372     mov         rsi,        arg(2) ;above
    373     movsxd      rax,        dword ptr arg(4) ;left_stride;
    374     pxor        xmm0,       xmm0
    375 %ifidn %1, ssse3
    376     movdqa      xmm2,       [GLOBAL(dc_1024)]
    377 %endif
    378     movq        xmm1,       [rsi]
    379     punpcklbw   xmm1,       xmm0
    380 
    381     ; set up left ptrs ans subtract topleft
    382     movd        xmm3,       [rsi-1]
    383     mov         rsi,        arg(3) ;left;
    384 %ifidn %1, sse2
    385     punpcklbw   xmm3,       xmm0
    386     pshuflw     xmm3,       xmm3, 0x0
    387     punpcklqdq  xmm3,       xmm3
    388 %else
    389     pshufb      xmm3,       xmm2
    390 %endif
    391     psubw       xmm1,       xmm3
    392 
    393     ; set up dest ptrs
    394     mov         rdi,        arg(0) ;dst;
    395     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    396 
    397 .vp8_intra_pred_uv_tm_%1_loop:
    398     movd        xmm3,       [rsi]
    399     movd        xmm5,       [rsi+rax]
    400 %ifidn %1, sse2
    401     punpcklbw   xmm3,       xmm0
    402     punpcklbw   xmm5,       xmm0
    403     pshuflw     xmm3,       xmm3, 0x0
    404     pshuflw     xmm5,       xmm5, 0x0
    405     punpcklqdq  xmm3,       xmm3
    406     punpcklqdq  xmm5,       xmm5
    407 %else
    408     pshufb      xmm3,       xmm2
    409     pshufb      xmm5,       xmm2
    410 %endif
    411     paddw       xmm3,       xmm1
    412     paddw       xmm5,       xmm1
    413     packuswb    xmm3,       xmm5
    414     movq  [rdi    ],        xmm3
    415     movhps[rdi+rcx],        xmm3
    416     lea         rsi,        [rsi+rax*2]
    417     lea         rdi,        [rdi+rcx*2]
    418     dec         edx
    419     jnz .vp8_intra_pred_uv_tm_%1_loop
    420 
    421     ; begin epilog
    422     pop         rdi
    423     pop         rsi
    424     RESTORE_GOT
    425     UNSHADOW_ARGS
    426     pop         rbp
    427     ret
    428 %endmacro
    429 
    430 vp8_intra_pred_uv_tm sse2
    431 vp8_intra_pred_uv_tm ssse3
    432 
    433 ;void vp8_intra_pred_uv_ve_mmx(
    434 ;    unsigned char *dst,
    435 ;    int dst_stride
    436 ;    unsigned char *above,
    437 ;    unsigned char *left,
    438 ;    int left_stride,
    439 ;    )
    440 global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
    441 sym(vp8_intra_pred_uv_ve_mmx):
    442     push        rbp
    443     mov         rbp, rsp
    444     SHADOW_ARGS_TO_STACK 5
    445     ; end prolog
    446 
    447     ; arg(3), arg(4) not used
    448 
    449     ; read from top
    450     mov         rax,        arg(2) ;src;
    451 
    452     movq        mm1,        [rax]
    453 
    454     ; write out
    455     mov         rax,        arg(0) ;dst;
    456     movsxd      rdx,        dword ptr arg(1) ;dst_stride
    457     lea         rcx,        [rdx*3]
    458 
    459     movq [rax      ],       mm1
    460     movq [rax+rdx  ],       mm1
    461     movq [rax+rdx*2],       mm1
    462     movq [rax+rcx  ],       mm1
    463     lea         rax,        [rax+rdx*4]
    464     movq [rax      ],       mm1
    465     movq [rax+rdx  ],       mm1
    466     movq [rax+rdx*2],       mm1
    467     movq [rax+rcx  ],       mm1
    468 
    469     ; begin epilog
    470     UNSHADOW_ARGS
    471     pop         rbp
    472     ret
    473 
    474 ;void vp8_intra_pred_uv_ho_mmx2(
    475 ;    unsigned char *dst,
    476 ;    int dst_stride
    477 ;    unsigned char *above,
    478 ;    unsigned char *left,
    479 ;    int left_stride
    480 ;    )
    481 %macro vp8_intra_pred_uv_ho 1
    482 global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
    483 sym(vp8_intra_pred_uv_ho_%1):
    484     push        rbp
    485     mov         rbp, rsp
    486     SHADOW_ARGS_TO_STACK 5
    487     push        rsi
    488     push        rdi
    489 %ifidn %1, ssse3
    490 %ifndef GET_GOT_SAVE_ARG
    491     push        rbx
    492 %endif
    493     GET_GOT     rbx
    494 %endif
    495     ; end prolog
    496 
    497     ;arg(2) not used
    498 
    499     ; read from left and write out
    500 %ifidn %1, mmx2
    501     mov         edx,        4
    502 %endif
    503     mov         rsi,        arg(3) ;left
    504     movsxd      rax,        dword ptr arg(4) ;left_stride;
    505     mov         rdi,        arg(0) ;dst;
    506     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    507 %ifidn %1, ssse3
    508     lea         rdx,        [rcx*3]
    509     movdqa      xmm2,       [GLOBAL(dc_00001111)]
    510     lea         rbx,        [rax*3]
    511 %endif
    512 
    513 %ifidn %1, mmx2
    514 .vp8_intra_pred_uv_ho_%1_loop:
    515     movd        mm0,        [rsi]
    516     movd        mm1,        [rsi+rax]
    517     punpcklbw   mm0,        mm0
    518     punpcklbw   mm1,        mm1
    519     pshufw      mm0,        mm0, 0x0
    520     pshufw      mm1,        mm1, 0x0
    521     movq  [rdi    ],        mm0
    522     movq  [rdi+rcx],        mm1
    523     lea         rsi,        [rsi+rax*2]
    524     lea         rdi,        [rdi+rcx*2]
    525     dec         edx
    526     jnz .vp8_intra_pred_uv_ho_%1_loop
    527 %else
    528     movd        xmm0,       [rsi]
    529     movd        xmm3,       [rsi+rax]
    530     movd        xmm1,       [rsi+rax*2]
    531     movd        xmm4,       [rsi+rbx]
    532     punpcklbw   xmm0,       xmm3
    533     punpcklbw   xmm1,       xmm4
    534     pshufb      xmm0,       xmm2
    535     pshufb      xmm1,       xmm2
    536     movq   [rdi    ],       xmm0
    537     movhps [rdi+rcx],       xmm0
    538     movq [rdi+rcx*2],       xmm1
    539     movhps [rdi+rdx],       xmm1
    540     lea         rsi,        [rsi+rax*4]
    541     lea         rdi,        [rdi+rcx*4]
    542     movd        xmm0,       [rsi]
    543     movd        xmm3,       [rsi+rax]
    544     movd        xmm1,       [rsi+rax*2]
    545     movd        xmm4,       [rsi+rbx]
    546     punpcklbw   xmm0,       xmm3
    547     punpcklbw   xmm1,       xmm4
    548     pshufb      xmm0,       xmm2
    549     pshufb      xmm1,       xmm2
    550     movq   [rdi    ],       xmm0
    551     movhps [rdi+rcx],       xmm0
    552     movq [rdi+rcx*2],       xmm1
    553     movhps [rdi+rdx],       xmm1
    554 %endif
    555 
    556     ; begin epilog
    557 %ifidn %1, ssse3
    558     RESTORE_GOT
    559 %ifndef GET_GOT_SAVE_ARG
    560     pop         rbx
    561 %endif
    562 %endif
    563     pop         rdi
    564     pop         rsi
    565     UNSHADOW_ARGS
    566     pop         rbp
    567     ret
    568 %endmacro
    569 
    570 vp8_intra_pred_uv_ho mmx2
    571 vp8_intra_pred_uv_ho ssse3
    572 
    573 ;void vp8_intra_pred_y_dc_sse2(
    574 ;    unsigned char *dst,
    575 ;    int dst_stride
    576 ;    unsigned char *above,
    577 ;    unsigned char *left,
    578 ;    int left_stride
    579 ;    )
    580 global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
    581 sym(vp8_intra_pred_y_dc_sse2):
    582     push        rbp
    583     mov         rbp, rsp
    584     SHADOW_ARGS_TO_STACK 5
    585     push        rsi
    586     push        rdi
    587     ; end prolog
    588 
    589     ; from top
    590     mov         rdi,        arg(2) ;above
    591     mov         rsi,        arg(3) ;left
    592     movsxd      rax,        dword ptr arg(4) ;left_stride;
    593 
    594     pxor        xmm0,       xmm0
    595     movdqa      xmm1,       [rdi]
    596     psadbw      xmm1,       xmm0
    597     movq        xmm2,       xmm1
    598     punpckhqdq  xmm1,       xmm1
    599     paddw       xmm1,       xmm2
    600 
    601     ; from left
    602     lea         rdi,        [rax*3]
    603 
    604     movzx       ecx,        byte [rsi]
    605     movzx       edx,        byte [rsi+rax]
    606     add         ecx,        edx
    607     movzx       edx,        byte [rsi+rax*2]
    608     add         ecx,        edx
    609     movzx       edx,        byte [rsi+rdi]
    610     add         ecx,        edx
    611     lea         rsi,        [rsi+rax*4]
    612 
    613     movzx       edx,        byte [rsi]
    614     add         ecx,        edx
    615     movzx       edx,        byte [rsi+rax]
    616     add         ecx,        edx
    617     movzx       edx,        byte [rsi+rax*2]
    618     add         ecx,        edx
    619     movzx       edx,        byte [rsi+rdi]
    620     add         ecx,        edx
    621     lea         rsi,        [rsi+rax*4]
    622 
    623     movzx       edx,        byte [rsi]
    624     add         ecx,        edx
    625     movzx       edx,        byte [rsi+rax]
    626     add         ecx,        edx
    627     movzx       edx,        byte [rsi+rax*2]
    628     add         ecx,        edx
    629     movzx       edx,        byte [rsi+rdi]
    630     add         ecx,        edx
    631     lea         rsi,        [rsi+rax*4]
    632 
    633     movzx       edx,        byte [rsi]
    634     add         ecx,        edx
    635     movzx       edx,        byte [rsi+rax]
    636     add         ecx,        edx
    637     movzx       edx,        byte [rsi+rax*2]
    638     add         ecx,        edx
    639     movzx       edx,        byte [rsi+rdi]
    640     add         ecx,        edx
    641 
    642     ; add up
    643     pextrw      edx,        xmm1, 0x0
    644     lea         edx,        [edx+ecx+16]
    645     sar         edx,        5
    646     movd        xmm1,       edx
    647     ; FIXME use pshufb for ssse3 version
    648     pshuflw     xmm1,       xmm1, 0x0
    649     punpcklqdq  xmm1,       xmm1
    650     packuswb    xmm1,       xmm1
    651 
    652     ; write out
    653     mov         rsi,        2
    654     mov         rdi,        arg(0) ;dst;
    655     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    656     lea         rax,        [rcx*3]
    657 
    658 .label
    659     movdqa [rdi      ],     xmm1
    660     movdqa [rdi+rcx  ],     xmm1
    661     movdqa [rdi+rcx*2],     xmm1
    662     movdqa [rdi+rax  ],     xmm1
    663     lea         rdi,        [rdi+rcx*4]
    664     movdqa [rdi      ],     xmm1
    665     movdqa [rdi+rcx  ],     xmm1
    666     movdqa [rdi+rcx*2],     xmm1
    667     movdqa [rdi+rax  ],     xmm1
    668     lea         rdi,        [rdi+rcx*4]
    669     dec         rsi
    670     jnz .label
    671 
    672     ; begin epilog
    673     pop         rdi
    674     pop         rsi
    675     UNSHADOW_ARGS
    676     pop         rbp
    677     ret
    678 
    679 ;void vp8_intra_pred_y_dctop_sse2(
    680 ;    unsigned char *dst,
    681 ;    int dst_stride
    682 ;    unsigned char *above,
    683 ;    unsigned char *left,
    684 ;    int left_stride
    685 ;    )
    686 global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
    687 sym(vp8_intra_pred_y_dctop_sse2):
    688     push        rbp
    689     mov         rbp, rsp
    690     SHADOW_ARGS_TO_STACK 5
    691     push        rsi
    692     GET_GOT     rbx
    693     ; end prolog
    694 
    695     ;arg(3), arg(4) not used
    696 
    697     ; from top
    698     mov         rcx,        arg(2) ;above;
    699     pxor        xmm0,       xmm0
    700     movdqa      xmm1,       [rcx]
    701     psadbw      xmm1,       xmm0
    702     movdqa      xmm2,       xmm1
    703     punpckhqdq  xmm1,       xmm1
    704     paddw       xmm1,       xmm2
    705 
    706     ; add up
    707     paddw       xmm1,       [GLOBAL(dc_8)]
    708     psraw       xmm1,       4
    709     ; FIXME use pshufb for ssse3 version
    710     pshuflw     xmm1,       xmm1, 0x0
    711     punpcklqdq  xmm1,       xmm1
    712     packuswb    xmm1,       xmm1
    713 
    714     ; write out
    715     mov         rsi,        2
    716     mov         rdx,        arg(0) ;dst;
    717     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    718     lea         rax,        [rcx*3]
    719 
    720 .label
    721     movdqa [rdx      ],     xmm1
    722     movdqa [rdx+rcx  ],     xmm1
    723     movdqa [rdx+rcx*2],     xmm1
    724     movdqa [rdx+rax  ],     xmm1
    725     lea         rdx,        [rdx+rcx*4]
    726     movdqa [rdx      ],     xmm1
    727     movdqa [rdx+rcx  ],     xmm1
    728     movdqa [rdx+rcx*2],     xmm1
    729     movdqa [rdx+rax  ],     xmm1
    730     lea         rdx,        [rdx+rcx*4]
    731     dec         rsi
    732     jnz .label
    733 
    734     ; begin epilog
    735     RESTORE_GOT
    736     pop         rsi
    737     UNSHADOW_ARGS
    738     pop         rbp
    739     ret
    740 
    741 ;void vp8_intra_pred_y_dcleft_sse2(
    742 ;    unsigned char *dst,
    743 ;    int dst_stride
    744 ;    unsigned char *above,
    745 ;    unsigned char *left,
    746 ;    int left_stride
    747 ;    )
    748 global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
    749 sym(vp8_intra_pred_y_dcleft_sse2):
    750     push        rbp
    751     mov         rbp, rsp
    752     SHADOW_ARGS_TO_STACK 5
    753     push        rsi
    754     push        rdi
    755     ; end prolog
    756 
    757     ;arg(2) not used
    758 
    759     ; from left
    760     mov         rsi,        arg(3) ;left;
    761     movsxd      rax,        dword ptr arg(4) ;left_stride;
    762 
    763     lea         rdi,        [rax*3]
    764     movzx       ecx,        byte [rsi]
    765     movzx       edx,        byte [rsi+rax]
    766     add         ecx,        edx
    767     movzx       edx,        byte [rsi+rax*2]
    768     add         ecx,        edx
    769     movzx       edx,        byte [rsi+rdi]
    770     add         ecx,        edx
    771     lea         rsi,        [rsi+rax*4]
    772     movzx       edx,        byte [rsi]
    773     add         ecx,        edx
    774     movzx       edx,        byte [rsi+rax]
    775     add         ecx,        edx
    776     movzx       edx,        byte [rsi+rax*2]
    777     add         ecx,        edx
    778     movzx       edx,        byte [rsi+rdi]
    779     add         ecx,        edx
    780     lea         rsi,        [rsi+rax*4]
    781     movzx       edx,        byte [rsi]
    782     add         ecx,        edx
    783     movzx       edx,        byte [rsi+rax]
    784     add         ecx,        edx
    785     movzx       edx,        byte [rsi+rax*2]
    786     add         ecx,        edx
    787     movzx       edx,        byte [rsi+rdi]
    788     add         ecx,        edx
    789     lea         rsi,        [rsi+rax*4]
    790     movzx       edx,        byte [rsi]
    791     add         ecx,        edx
    792     movzx       edx,        byte [rsi+rax]
    793     add         ecx,        edx
    794     movzx       edx,        byte [rsi+rax*2]
    795     add         ecx,        edx
    796     movzx       edx,        byte [rsi+rdi]
    797     lea         edx,        [ecx+edx+8]
    798 
    799     ; add up
    800     shr         edx,        4
    801     movd        xmm1,       edx
    802     ; FIXME use pshufb for ssse3 version
    803     pshuflw     xmm1,       xmm1, 0x0
    804     punpcklqdq  xmm1,       xmm1
    805     packuswb    xmm1,       xmm1
    806 
    807     ; write out
    808     mov         rsi,        2
    809     mov         rdi,        arg(0) ;dst;
    810     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    811     lea         rax,        [rcx*3]
    812 
    813 .label
    814     movdqa [rdi      ],     xmm1
    815     movdqa [rdi+rcx  ],     xmm1
    816     movdqa [rdi+rcx*2],     xmm1
    817     movdqa [rdi+rax  ],     xmm1
    818     lea         rdi,        [rdi+rcx*4]
    819     movdqa [rdi      ],     xmm1
    820     movdqa [rdi+rcx  ],     xmm1
    821     movdqa [rdi+rcx*2],     xmm1
    822     movdqa [rdi+rax  ],     xmm1
    823     lea         rdi,        [rdi+rcx*4]
    824     dec         rsi
    825     jnz .label
    826 
    827     ; begin epilog
    828     pop         rdi
    829     pop         rsi
    830     UNSHADOW_ARGS
    831     pop         rbp
    832     ret
    833 
    834 ;void vp8_intra_pred_y_dc128_sse2(
    835 ;    unsigned char *dst,
    836 ;    int dst_stride
    837 ;    unsigned char *above,
    838 ;    unsigned char *left,
    839 ;    int left_stride
    840 ;    )
    841 global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
    842 sym(vp8_intra_pred_y_dc128_sse2):
    843     push        rbp
    844     mov         rbp, rsp
    845     SHADOW_ARGS_TO_STACK 5
    846     push        rsi
    847     GET_GOT     rbx
    848     ; end prolog
    849 
    850     ;arg(2), arg(3), arg(4) not used
    851 
    852     ; write out
    853     mov         rsi,        2
    854     movdqa      xmm1,       [GLOBAL(dc_128)]
    855     mov         rax,        arg(0) ;dst;
    856     movsxd      rdx,        dword ptr arg(1) ;dst_stride
    857     lea         rcx,        [rdx*3]
    858 
    859 .label
    860     movdqa [rax      ],     xmm1
    861     movdqa [rax+rdx  ],     xmm1
    862     movdqa [rax+rdx*2],     xmm1
    863     movdqa [rax+rcx  ],     xmm1
    864     lea         rax,        [rax+rdx*4]
    865     movdqa [rax      ],     xmm1
    866     movdqa [rax+rdx  ],     xmm1
    867     movdqa [rax+rdx*2],     xmm1
    868     movdqa [rax+rcx  ],     xmm1
    869     lea         rax,        [rax+rdx*4]
    870     dec         rsi
    871     jnz .label
    872 
    873     ; begin epilog
    874     RESTORE_GOT
    875     pop         rsi
    876     UNSHADOW_ARGS
    877     pop         rbp
    878     ret
    879 
    880 ; 16x16 luma TM intra prediction, instantiated below for sse2 and ssse3:
    881 ;   dst[r][c] = saturate_u8(left[r*left_stride] + above[c] - above[-1])
    882 ;void vp8_intra_pred_y_tm_<sse2|ssse3>(unsigned char *dst, int dst_stride,
    883 ;    unsigned char *above, unsigned char *left, int left_stride)
    884 ; above and every dst row are accessed with movdqa, so they must be 16-byte
    885 ; aligned. above[-1] and each left pixel are fetched with 4-byte movd loads.
    886 ; The loop label is function-local so it cannot clash with other labels.
    887 %macro vp8_intra_pred_y_tm 1
    888 global sym(vp8_intra_pred_y_tm_%1) PRIVATE
    889 sym(vp8_intra_pred_y_tm_%1):
    890     push        rbp
    891     mov         rbp, rsp
    892     SHADOW_ARGS_TO_STACK 5
    893     SAVE_XMM 7
    894     push        rsi
    895     push        rdi
    896     GET_GOT     rbx
    897     ; end prolog
    898 
    899     ; read top row
    900     mov         edx,        8                   ; loop counter: 8 passes, 2 rows each
    901     mov         rsi,        arg(2) ;above
    902     movsxd      rax,        dword ptr arg(4) ;left_stride;
    903     pxor        xmm0,       xmm0                ; zero, used to widen bytes to words
    904 %ifidn %1, ssse3
    905     movdqa      xmm3,       [GLOBAL(dc_1024)]   ; pshufb mask, bytes 00 04 repeated
    906 %endif
    907     movdqa      xmm1,       [rsi]               ; above[0..15]
    908     movdqa      xmm2,       xmm1
    909     punpcklbw   xmm1,       xmm0                ; xmm1 = above[0..7] as words
    910     punpckhbw   xmm2,       xmm0                ; xmm2 = above[8..15] as words
    911 
    912     ; set up left ptrs and subtract topleft
    913     movd        xmm4,       [rsi-1]             ; byte 0 = above[-1]; bytes 4..15 are zeroed
    914     mov         rsi,        arg(3) ;left
    915 %ifidn %1, sse2
    916     punpcklbw   xmm4,       xmm0                ; byte 0 widened into word 0
    917     pshuflw     xmm4,       xmm4, 0x0           ; word 0 copied to words 0..3
    918     punpcklqdq  xmm4,       xmm4                ; low qword copied to the high qword
    919 %else
    920     pshufb      xmm4,       xmm3                ; each word = byte 0 (low) and zero byte 4 (high)
    921 %endif
    922     psubw       xmm1,       xmm4                ; xmm1 = above[0..7]  - topleft
    923     psubw       xmm2,       xmm4                ; xmm2 = above[8..15] - topleft
    924 
    925     ; set up dest ptrs
    926     mov         rdi,        arg(0) ;dst;
    927     movsxd      rcx,        dword ptr arg(1) ;dst_stride
    928 .vp8_intra_pred_y_tm_%1_loop:
    929     movd        xmm4,       [rsi]               ; byte 0 = left pixel of the first row
    930     movd        xmm5,       [rsi+rax]           ; byte 0 = left pixel of the second row
    931 %ifidn %1, sse2
    932     punpcklbw   xmm4,       xmm0
    933     punpcklbw   xmm5,       xmm0
    934     pshuflw     xmm4,       xmm4, 0x0
    935     pshuflw     xmm5,       xmm5, 0x0
    936     punpcklqdq  xmm4,       xmm4                ; xmm4 = first left pixel in all 8 words
    937     punpcklqdq  xmm5,       xmm5                ; xmm5 = second left pixel in all 8 words
    938 %else
    939     pshufb      xmm4,       xmm3                ; same broadcast done with one shuffle
    940     pshufb      xmm5,       xmm3
    941 %endif
    942     movdqa      xmm6,       xmm4                ; copies for columns 8..15
    943     movdqa      xmm7,       xmm5
    944     paddw       xmm4,       xmm1                ; first row, columns 0..7
    945     paddw       xmm6,       xmm2                ; first row, columns 8..15
    946     paddw       xmm5,       xmm1                ; second row, columns 0..7
    947     paddw       xmm7,       xmm2                ; second row, columns 8..15
    948     packuswb    xmm4,       xmm6                ; words -> bytes with unsigned saturation
    949     packuswb    xmm5,       xmm7
    950     movdqa [rdi    ],       xmm4
    951     movdqa [rdi+rcx],       xmm5
    952     lea         rsi,        [rsi+rax*2]         ; advance left by two rows
    953     lea         rdi,        [rdi+rcx*2]         ; advance dst by two rows
    954     dec         edx
    955     jnz .vp8_intra_pred_y_tm_%1_loop
    956 
    957     ; begin epilog
    958     RESTORE_GOT
    959     pop         rdi
    960     pop         rsi
    961     RESTORE_XMM
    962     UNSHADOW_ARGS
    963     pop         rbp
    964     ret
    965 %endmacro
    966 
    967 vp8_intra_pred_y_tm sse2
    968 vp8_intra_pred_y_tm ssse3
    969 
    970 ; 16x16 luma vertical intra prediction: copies the 16 bytes at above[]
    971 ; into each of the 16 rows of dst.
    972 ;void vp8_intra_pred_y_ve_sse2(unsigned char *dst, int dst_stride,
    973 ;    unsigned char *above, unsigned char *left, int left_stride)
    974 ; left and left_stride are ignored. above and every dst row are accessed
    975 ; with movdqa, so they must be 16-byte aligned.
    976 ; Uses rax, rcx, rdx and xmm1 as scratch; rsi is saved and restored.
    977 global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
    978 sym(vp8_intra_pred_y_ve_sse2):
    979     push        rbp
    980     mov         rbp, rsp
    981     SHADOW_ARGS_TO_STACK 5
    982     push        rsi
    983     ; end prolog
    984 
    985     ;arg(3), arg(4) not used
    986 
    987     mov         rax,        arg(2) ;above;
    988     mov         rsi,        2                   ; loop counter: 2 passes, 8 rows each
    989     movsxd      rdx,        dword ptr arg(1) ;dst_stride
    990 
    991     ; read from top
    992     movdqa      xmm1,       [rax]               ; xmm1 = above[0..15]
    993 
    994     ; write out
    995     mov         rax,        arg(0) ;dst;
    996     lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride
    997 
    998 .label:
    999     movdqa [rax      ],     xmm1
   1000     movdqa [rax+rdx  ],     xmm1
   1001     movdqa [rax+rdx*2],     xmm1
   1002     movdqa [rax+rcx  ],     xmm1
   1003     lea         rax,        [rax+rdx*4]         ; advance dst by four rows
   1004     movdqa [rax      ],     xmm1
   1005     movdqa [rax+rdx  ],     xmm1
   1006     movdqa [rax+rdx*2],     xmm1
   1007     movdqa [rax+rcx  ],     xmm1
   1008     lea         rax,        [rax+rdx*4]         ; advance dst by four more rows
   1009     dec         rsi
   1010     jnz .label
   1011 
   1012     ; begin epilog
   1013     pop         rsi
   1014     UNSHADOW_ARGS
   1015     pop         rbp
   1016     ret
   1017 
   1018 ; 16x16 luma horizontal intra prediction: row r of dst is filled with
   1019 ; 16 copies of the byte at left[r * left_stride].
   1020 ;void vp8_intra_pred_y_ho_sse2(unsigned char *dst, int dst_stride,
   1021 ;    unsigned char *above, unsigned char *left, int left_stride)
   1022 ; above is ignored. Every dst row is written with movdqa, so it must be
   1023 ; 16-byte aligned. Each left pixel is fetched with a 4-byte movd, so the
   1024 ; 3 bytes after it must be readable.
   1025 global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
   1026 sym(vp8_intra_pred_y_ho_sse2):
   1027     push        rbp
   1028     mov         rbp, rsp
   1029     SHADOW_ARGS_TO_STACK 5
   1030     push        rsi
   1031     push        rdi
   1032     ; end prolog
   1033 
   1034     ;arg(2) not used
   1035 
   1036     ; read from left and write out
   1037     mov         edx,        8                   ; loop counter: 8 passes, 2 rows each
   1038     mov         rsi,        arg(3) ;left;
   1039     movsxd      rax,        dword ptr arg(4) ;left_stride;
   1040     mov         rdi,        arg(0) ;dst;
   1041     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   1042 
   1043 .vp8_intra_pred_y_ho_sse2_loop:
   1044     movd        xmm0,       [rsi]               ; byte 0 = left pixel of the first row
   1045     movd        xmm1,       [rsi+rax]           ; byte 0 = left pixel of the second row
   1046     ; FIXME use pshufb for ssse3 version
   1047     punpcklbw   xmm0,       xmm0                ; word 0 = pixel duplicated in both bytes
   1048     punpcklbw   xmm1,       xmm1
   1049     pshuflw     xmm0,       xmm0, 0x0           ; word 0 copied to words 0..3
   1050     pshuflw     xmm1,       xmm1, 0x0
   1051     punpcklqdq  xmm0,       xmm0                ; low qword copied up: 16 identical bytes
   1052     punpcklqdq  xmm1,       xmm1
   1053     movdqa [rdi    ],       xmm0
   1054     movdqa [rdi+rcx],       xmm1
   1055     lea         rsi,        [rsi+rax*2]         ; advance left by two rows
   1056     lea         rdi,        [rdi+rcx*2]         ; advance dst by two rows
   1057     dec         edx
   1058     jnz .vp8_intra_pred_y_ho_sse2_loop
   1059 
   1060     ; begin epilog
   1061     pop         rdi
   1062     pop         rsi
   1063     UNSHADOW_ARGS
   1064     pop         rbp
   1065     ret
   1066 
   1067 SECTION_RODATA
   1068 align 16
   1069 dc_128:                 ; 16 bytes, each 128
   1070     times 16 db 128
   1071 dc_4:                   ; four words of 4 (8 bytes); no align of its own, it directly follows the 16-byte dc_128
   1072     times 4 dw 4
   1073 align 16
   1074 dc_8:                   ; eight words of 8
   1075     times 8 dw 8
   1076 align 16
   1077 dc_1024:                ; eight words of 0x0400, i.e. bytes 00 04 repeated; loaded as the pshufb mask by the ssse3 vp8_intra_pred_y_tm
   1078     times 8 dw 0x400
   1079 align 16
   1080 dc_00001111:            ; eight 0 bytes followed by eight 1 bytes
   1081     times 8 db 0
   1082     times 8 db 1
   1082     times 8 db 1
   1083