Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;Note: the tap3 and tap4 products must be applied and added last, after all
     15 ;the other taps, to avoid overflow of the intermediate sums.
     16 
     17 %macro GET_FILTERS_4 0
        ;GET_FILTERS_4: read the eight 16-bit filter taps addressed by arg(5) and
        ;store them in the stack slots used by APPLY_FILTER_4. Each slot holds
        ;two taps: one replicated across the low four words and the other
        ;replicated across the high four words (k0k1, k2k3, k5k4, k6k7).
        ;Also stores the rounding vector (krd) and an all-zero vector (zero).
        ;The invoking function must %define k0k1, k2k3, k5k4, k6k7, krd and zero
        ;as 16-byte-aligned memory operands (they are written with movdqa).
        ;Clobbers: rdx, rcx, xmm0-xmm7.
     18     mov         rdx, arg(5)                 ;filter ptr
     19     mov         rcx, 0x0400040              ;0x0040 (64) in each of the two low words
     20 
     21     movdqa      xmm7, [rdx]                 ;load filters (aligned 16-byte load)
     22     pshuflw     xmm0, xmm7, 0b              ;k0 broadcast over the low four words
     23     pshuflw     xmm1, xmm7, 01010101b       ;k1
     24     pshuflw     xmm2, xmm7, 10101010b       ;k2
     25     pshuflw     xmm3, xmm7, 11111111b       ;k3
     26     psrldq      xmm7, 8                     ;move taps 4-7 into the low qword
     27     pshuflw     xmm4, xmm7, 0b              ;k4
     28     pshuflw     xmm5, xmm7, 01010101b       ;k5
     29     pshuflw     xmm6, xmm7, 10101010b       ;k6
     30     pshuflw     xmm7, xmm7, 11111111b       ;k7
     31 
     32     punpcklqdq  xmm0, xmm1                  ;low qword = k0, high qword = k1
     33     punpcklqdq  xmm2, xmm3                  ;low qword = k2, high qword = k3
     34     punpcklqdq  xmm5, xmm4                  ;low qword = k5, high qword = k4
     35     punpcklqdq  xmm6, xmm7                  ;low qword = k6, high qword = k7
     36 
     37     movdqa      k0k1, xmm0                  ;store the paired taps on the stack
     38     movdqa      k2k3, xmm2
     39     movdqa      k5k4, xmm5
     40     movdqa      k6k7, xmm6
     41 
     42     movq        xmm6, rcx
     43     pshufd      xmm6, xmm6, 0               ;replicate the low dword: every word = 64
     44     movdqa      krd, xmm6                   ;rounding term added before the shift
     45 
     46     pxor        xmm7, xmm7
     47     movdqa      zero, xmm7                  ;zero vector used to widen bytes to words
     48 %endm
     49 
     50 %macro APPLY_FILTER_4 1
        ;APPLY_FILTER_4 avg: compute four filtered output bytes and store them
        ;at [rdi].
        ;  %1 - non-zero: average the result with the four bytes already at
        ;       [rdi] (pavgb) before storing; zero: store the result directly.
        ;Inputs: the low dword of xmm0..xmm7 holds the four source bytes for
        ;taps 0..7 respectively; k0k1, k2k3, k5k4, k6k7, krd and zero are the
        ;stack slots filled by GET_FILTERS_4.
        ;Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
     51     punpckldq   xmm0, xmm1                  ;two row in one register
     52     punpckldq   xmm6, xmm7                  ;taps 6|7
     53     punpckldq   xmm2, xmm3                  ;taps 2|3
     54     punpckldq   xmm5, xmm4                  ;taps 5|4 (same order as the k5k4 slot)
     55 
     56     punpcklbw   xmm0, zero                  ;unpack to word
     57     punpcklbw   xmm6, zero
     58     punpcklbw   xmm2, zero
     59     punpcklbw   xmm5, zero
     60 
     61     pmullw      xmm0, k0k1                  ;multiply the filter factors
     62     pmullw      xmm6, k6k7
     63     pmullw      xmm2, k2k3
     64     pmullw      xmm5, k5k4
     65 
     66     paddsw      xmm0, xmm6                  ;sum
     67     movdqa      xmm1, xmm0
     68     psrldq      xmm1, 8                     ;bring the high half (taps 1+7) down
     69     paddsw      xmm0, xmm1                  ;low half = taps 0+6+1+7
     70     paddsw      xmm0, xmm2                  ;+ tap 2
     71     psrldq      xmm2, 8                     ;tap 3 product into the low half
     72     paddsw      xmm0, xmm5                  ;+ tap 5
     73     psrldq      xmm5, 8                     ;tap 4 product into the low half
     74     paddsw      xmm0, xmm2                  ;+ tap 3 (taps 3 and 4 are added last)
     75     paddsw      xmm0, xmm5                  ;+ tap 4
     76 
     77     paddsw      xmm0, krd                   ;rounding
     78     psraw       xmm0, 7                     ;shift
     79     packuswb    xmm0, xmm0                  ;pack to byte
     80 
        ;optional averaging with the existing destination bytes
     81 %if %1
     82     movd        xmm1, [rdi]
     83     pavgb       xmm0, xmm1
     84 %endif
     85     movd        [rdi], xmm0                 ;store four output bytes
     86 %endm
     87 
     88 %macro GET_FILTERS 0
        ;GET_FILTERS: set up the 8-pixel-wide kernels. Loads rsi = arg(0) and
        ;rdi = arg(2), then reads the eight 16-bit filter taps addressed by
        ;arg(5) and stores each tap, replicated across all eight words, in its
        ;own stack slot (k0..k7). Also stores the rounding vector (krd) and an
        ;all-zero vector (zero).
        ;The invoking function must %define k0..k7, krd and zero as
        ;16-byte-aligned memory operands (they are written with movdqa).
        ;Clobbers: rdx, rsi, rdi, rcx, xmm0-xmm7.
     89     mov         rdx, arg(5)                 ;filter ptr
     90     mov         rsi, arg(0)                 ;src_ptr
     91     mov         rdi, arg(2)                 ;output_ptr
     92     mov         rcx, 0x0400040              ;0x0040 (64) in each of the two low words
     93 
     94     movdqa      xmm7, [rdx]                 ;load filters (aligned 16-byte load)
     95     pshuflw     xmm0, xmm7, 0b              ;k0 broadcast over the low four words
     96     pshuflw     xmm1, xmm7, 01010101b       ;k1
     97     pshuflw     xmm2, xmm7, 10101010b       ;k2
     98     pshuflw     xmm3, xmm7, 11111111b       ;k3
     99     pshufhw     xmm4, xmm7, 0b              ;k4 broadcast over the high four words
    100     pshufhw     xmm5, xmm7, 01010101b       ;k5
    101     pshufhw     xmm6, xmm7, 10101010b       ;k6
    102     pshufhw     xmm7, xmm7, 11111111b       ;k7
    103 
    104     punpcklwd   xmm0, xmm0                  ;spread the low-half tap over all 8 words
    105     punpcklwd   xmm1, xmm1
    106     punpcklwd   xmm2, xmm2
    107     punpcklwd   xmm3, xmm3
    108     punpckhwd   xmm4, xmm4                  ;spread the high-half tap over all 8 words
    109     punpckhwd   xmm5, xmm5
    110     punpckhwd   xmm6, xmm6
    111     punpckhwd   xmm7, xmm7
    112 
    113     movdqa      k0,   xmm0                  ;store filter factors on stack
    114     movdqa      k1,   xmm1
    115     movdqa      k2,   xmm2
    116     movdqa      k3,   xmm3
    117     movdqa      k4,   xmm4
    118     movdqa      k5,   xmm5
    119     movdqa      k6,   xmm6
    120     movdqa      k7,   xmm7
    121 
    122     movq        xmm6, rcx
    123     pshufd      xmm6, xmm6, 0               ;replicate the low dword: every word = 64
    124     movdqa      krd, xmm6                   ;rounding
    125 
    126     pxor        xmm7, xmm7
    127     movdqa      zero, xmm7                  ;zero vector used to widen bytes to words
    128 %endm
    129 
    130 %macro LOAD_VERT_8 1
        ;LOAD_VERT_8 offset: load 8 bytes from each of eight consecutive source
        ;rows into xmm0..xmm7 (row n goes to xmm<n>).
        ;  %1 - byte offset added to every row address.
        ;Expects rsi = address of row 0, rax = source pitch and rdx = 3 * rax
        ;(the callers in this file set rdx with lea rdx, [rax + rax * 2]).
        ;Side effect: rsi is advanced by rax (one row).
    131     movq        xmm0, [rsi + %1]            ;0
    132     movq        xmm1, [rsi + rax + %1]      ;1
    133     movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    134     lea         rsi,  [rsi + rax]           ;rsi now points at row 1
    135     movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    136     movq        xmm2, [rsi + rax + %1]      ;2
    137     movq        xmm3, [rsi + rax * 2 + %1]  ;3
    138     movq        xmm4, [rsi + rdx + %1]      ;4
    139     movq        xmm5, [rsi + rax * 4 + %1]  ;5
    140 %endm
    141 
    142 %macro APPLY_FILTER_8 2
        ;APPLY_FILTER_8 avg, offset: compute eight filtered output bytes and
        ;store them at [rdi + offset].
        ;  %1 - non-zero: average the result with the eight bytes already at the
        ;       destination (pavgb) before storing; zero: store directly.
        ;  %2 - byte offset from rdi of the destination.
        ;Inputs: the low qword of xmm0..xmm7 holds the eight source bytes for
        ;taps 0..7 respectively; k0..k7, krd and zero are the stack slots
        ;filled by GET_FILTERS.
        ;Clobbers: xmm0-xmm7.
    143     punpcklbw   xmm0, zero                  ;widen bytes to words
    144     punpcklbw   xmm1, zero
    145     punpcklbw   xmm6, zero
    146     punpcklbw   xmm7, zero
    147     punpcklbw   xmm2, zero
    148     punpcklbw   xmm5, zero
    149     punpcklbw   xmm3, zero
    150     punpcklbw   xmm4, zero
    151 
    152     pmullw      xmm0, k0                    ;multiply each row by its tap
    153     pmullw      xmm1, k1
    154     pmullw      xmm6, k6
    155     pmullw      xmm7, k7
    156     pmullw      xmm2, k2
    157     pmullw      xmm5, k5
    158     pmullw      xmm3, k3
    159     pmullw      xmm4, k4
    160 
    161     paddsw      xmm0, xmm1                  ;saturating sum; taps 3 and 4 go last
    162     paddsw      xmm0, xmm6
    163     paddsw      xmm0, xmm7
    164     paddsw      xmm0, xmm2
    165     paddsw      xmm0, xmm5
    166     paddsw      xmm0, xmm3
    167     paddsw      xmm0, xmm4
    168 
    169     paddsw      xmm0, krd                   ;rounding
    170     psraw       xmm0, 7                     ;shift
    171     packuswb    xmm0, xmm0                  ;pack back to byte
        ;optional averaging with the existing destination bytes
    172 %if %1
    173     movq        xmm1, [rdi + %2]
    174     pavgb       xmm0, xmm1
    175 %endif
    176     movq        [rdi + %2], xmm0            ;store eight output bytes
    177 %endm
    178 
    179 ;void vpx_filter_block1d4_v8_sse2
    180 ;(
    181 ;    unsigned char *src_ptr,
    182 ;    unsigned int   src_pitch,
    183 ;    unsigned char *output_ptr,
    184 ;    unsigned int   out_pitch,
    185 ;    unsigned int   output_height,
    186 ;    short *filter
    187 ;)
        ;Vertical 8-tap filter, 4 pixels wide. Each loop iteration combines
        ;eight source rows starting at the current source row and writes four
        ;bytes to the current output row, then moves down one row in both
        ;buffers.
        ;Register roles in the loop: rsi = source row 0, rdi = output row,
        ;rax = source pitch, rdx = 3 * rax, rbx = output pitch, rcx = rows left.
        ;NOTE: both pitches are sign-extended from 32 bits (movsxd), and the row
        ;counter is decremented before it is tested, so output_height must be
        ;non-zero.
    188 global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
    189 sym(vpx_filter_block1d4_v8_sse2):
    190     push        rbp
    191     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    192     SHADOW_ARGS_TO_STACK 6
    193     SAVE_XMM 7
    194     push        rsi
    195     push        rdi
    196     push        rbx
    197     ; end prolog
    198 
    199     ALIGN_STACK 16, rax
    200     sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    201     %define k0k1 [rsp + 16 * 0]
    202     %define k2k3 [rsp + 16 * 1]
    203     %define k5k4 [rsp + 16 * 2]
    204     %define k6k7 [rsp + 16 * 3]
    205     %define krd [rsp + 16 * 4]
    206     %define zero [rsp + 16 * 5]
    207 
    208     GET_FILTERS_4
    209 
    210     mov         rsi, arg(0)                 ;src_ptr
    211     mov         rdi, arg(2)                 ;output_ptr
    212 
    213     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    214     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    215     lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
    216     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    217 
    218 .loop:
    219     movd        xmm0, [rsi]                 ;load src: row 0
    220     movd        xmm1, [rsi + rax]           ;1
    221     movd        xmm6, [rsi + rdx * 2]       ;6
    222     lea         rsi,  [rsi + rax]           ;advance to the next source row
    223     movd        xmm7, [rsi + rdx * 2]       ;7
    224     movd        xmm2, [rsi + rax]           ;2
    225     movd        xmm3, [rsi + rax * 2]       ;3
    226     movd        xmm4, [rsi + rdx]           ;4
    227     movd        xmm5, [rsi + rax * 4]       ;5
    228 
    229     APPLY_FILTER_4 0
    230 
    231     lea         rdi, [rdi + rbx]            ;next output row
    232     dec         rcx
    233     jnz         .loop
    234 
    235     add rsp, 16 * 6                         ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    236     pop rsp
    237     pop rbx
    238     ; begin epilog
    239     pop rdi
    240     pop rsi
    241     RESTORE_XMM
    242     UNSHADOW_ARGS
    243     pop         rbp
    244     ret
    245 
    246 ;void vpx_filter_block1d8_v8_sse2
    247 ;(
    248 ;    unsigned char *src_ptr,
    249 ;    unsigned int   src_pitch,
    250 ;    unsigned char *output_ptr,
    251 ;    unsigned int   out_pitch,
    252 ;    unsigned int   output_height,
    253 ;    short *filter
    254 ;)
        ;Vertical 8-tap filter, 8 pixels wide. Each loop iteration combines
        ;eight source rows starting at the current source row and writes eight
        ;bytes to the current output row, then moves down one row in both
        ;buffers.
        ;Register roles in the loop: rsi = source row 0 and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = 3 * rax,
        ;rbx = output pitch, rcx = rows left.
        ;NOTE: both pitches are sign-extended from 32 bits (movsxd), and the row
        ;counter is decremented before it is tested, so output_height must be
        ;non-zero.
    255 global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
    256 sym(vpx_filter_block1d8_v8_sse2):
    257     push        rbp
    258     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    259     SHADOW_ARGS_TO_STACK 6
    260     SAVE_XMM 7
    261     push        rsi
    262     push        rdi
    263     push        rbx
    264     ; end prolog
    265 
    266     ALIGN_STACK 16, rax
    267     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    268     %define k0 [rsp + 16 * 0]
    269     %define k1 [rsp + 16 * 1]
    270     %define k2 [rsp + 16 * 2]
    271     %define k3 [rsp + 16 * 3]
    272     %define k4 [rsp + 16 * 4]
    273     %define k5 [rsp + 16 * 5]
    274     %define k6 [rsp + 16 * 6]
    275     %define k7 [rsp + 16 * 7]
    276     %define krd [rsp + 16 * 8]
    277     %define zero [rsp + 16 * 9]
    278 
    279     GET_FILTERS
    280 
    281     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    282     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    283     lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
    284     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    285 
    286 .loop:
        ;LOAD_VERT_8 also advances rsi by one source row
    287     LOAD_VERT_8 0
    288     APPLY_FILTER_8 0, 0
    289 
    290     lea         rdi, [rdi + rbx]            ;next output row
    291     dec         rcx
    292     jnz         .loop
    293 
    294     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    295     pop rsp
    296     pop rbx
    297     ; begin epilog
    298     pop rdi
    299     pop rsi
    300     RESTORE_XMM
    301     UNSHADOW_ARGS
    302     pop         rbp
    303     ret
    304 
    305 ;void vpx_filter_block1d16_v8_sse2
    306 ;(
    307 ;    unsigned char *src_ptr,
    308 ;    unsigned int   src_pitch,
    309 ;    unsigned char *output_ptr,
    310 ;    unsigned int   out_pitch,
    311 ;    unsigned int   output_height,
    312 ;    short *filter
    313 ;)
        ;Vertical 8-tap filter, 16 pixels wide. Each loop iteration filters the
        ;left eight columns and then the right eight columns of one output row,
        ;each from eight source rows starting at the current source row, then
        ;moves down one row in both buffers.
        ;Register roles in the loop: rsi = source row 0 and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = 3 * rax,
        ;rbx = output pitch, rcx = rows left.
        ;NOTE: both pitches are sign-extended from 32 bits (movsxd), and the row
        ;counter is decremented before it is tested, so output_height must be
        ;non-zero.
    314 global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
    315 sym(vpx_filter_block1d16_v8_sse2):
    316     push        rbp
    317     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    318     SHADOW_ARGS_TO_STACK 6
    319     SAVE_XMM 7
    320     push        rsi
    321     push        rdi
    322     push        rbx
    323     ; end prolog
    324 
    325     ALIGN_STACK 16, rax
    326     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    327     %define k0 [rsp + 16 * 0]
    328     %define k1 [rsp + 16 * 1]
    329     %define k2 [rsp + 16 * 2]
    330     %define k3 [rsp + 16 * 3]
    331     %define k4 [rsp + 16 * 4]
    332     %define k5 [rsp + 16 * 5]
    333     %define k6 [rsp + 16 * 6]
    334     %define k7 [rsp + 16 * 7]
    335     %define krd [rsp + 16 * 8]
    336     %define zero [rsp + 16 * 9]
    337 
    338     GET_FILTERS
    339 
    340     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    341     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    342     lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
    343     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    344 
    345 .loop:
    346     LOAD_VERT_8 0
    347     APPLY_FILTER_8 0, 0
    348     sub         rsi, rax                    ;undo LOAD_VERT_8's row advance
    349 
        ;right half: same rows at byte offset 8; this load advances rsi again
    350     LOAD_VERT_8 8
    351     APPLY_FILTER_8 0, 8
    352     add         rdi, rbx                    ;next output row
    353 
    354     dec         rcx
    355     jnz         .loop
    356 
    357     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    358     pop rsp
    359     pop rbx
    360     ; begin epilog
    361     pop rdi
    362     pop rsi
    363     RESTORE_XMM
    364     UNSHADOW_ARGS
    365     pop         rbp
    366     ret
    367 
    368 global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
        ;Vertical 8-tap filter, 4 pixels wide, averaging variant: the filtered
        ;bytes are averaged (pavgb) with the bytes already in the output row
        ;before being stored. Arguments are read in the same positions as in
        ;vpx_filter_block1d4_v8_sse2: arg(0) src, arg(1) source pitch,
        ;arg(2) output, arg(3) output pitch, arg(4) row count, arg(5) filter.
        ;Register roles in the loop: rsi = source row 0, rdi = output row,
        ;rax = source pitch, rdx = 3 * rax, rbx = output pitch, rcx = rows left.
        ;NOTE: the row counter is decremented before it is tested, so the row
        ;count must be non-zero.
    369 sym(vpx_filter_block1d4_v8_avg_sse2):
    370     push        rbp
    371     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    372     SHADOW_ARGS_TO_STACK 6
    373     SAVE_XMM 7
    374     push        rsi
    375     push        rdi
    376     push        rbx
    377     ; end prolog
    378 
    379     ALIGN_STACK 16, rax
    380     sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    381     %define k0k1 [rsp + 16 * 0]
    382     %define k2k3 [rsp + 16 * 1]
    383     %define k5k4 [rsp + 16 * 2]
    384     %define k6k7 [rsp + 16 * 3]
    385     %define krd [rsp + 16 * 4]
    386     %define zero [rsp + 16 * 5]
    387 
    388     GET_FILTERS_4
    389 
    390     mov         rsi, arg(0)                 ;src_ptr
    391     mov         rdi, arg(2)                 ;output_ptr
    392 
    393     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    394     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    395     lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
    396     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    397 
    398 .loop:
    399     movd        xmm0, [rsi]                 ;load src: row 0
    400     movd        xmm1, [rsi + rax]           ;1
    401     movd        xmm6, [rsi + rdx * 2]       ;6
    402     lea         rsi,  [rsi + rax]           ;advance to the next source row
    403     movd        xmm7, [rsi + rdx * 2]       ;7
    404     movd        xmm2, [rsi + rax]           ;2
    405     movd        xmm3, [rsi + rax * 2]       ;3
    406     movd        xmm4, [rsi + rdx]           ;4
    407     movd        xmm5, [rsi + rax * 4]       ;5
    408 
        ;argument 1 selects averaging with the existing output bytes
    409     APPLY_FILTER_4 1
    410 
    411     lea         rdi, [rdi + rbx]            ;next output row
    412     dec         rcx
    413     jnz         .loop
    414 
    415     add rsp, 16 * 6                         ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    416     pop rsp
    417     pop rbx
    418     ; begin epilog
    419     pop rdi
    420     pop rsi
    421     RESTORE_XMM
    422     UNSHADOW_ARGS
    423     pop         rbp
    424     ret
    425 
    426 global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
        ;Vertical 8-tap filter, 8 pixels wide, averaging variant: the filtered
        ;bytes are averaged (pavgb) with the bytes already in the output row
        ;before being stored. Arguments are read in the same positions as in
        ;vpx_filter_block1d8_v8_sse2: arg(0) src, arg(1) source pitch,
        ;arg(2) output, arg(3) output pitch, arg(4) row count, arg(5) filter.
        ;Register roles in the loop: rsi = source row 0 and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = 3 * rax,
        ;rbx = output pitch, rcx = rows left.
        ;NOTE: the row counter is decremented before it is tested, so the row
        ;count must be non-zero.
    427 sym(vpx_filter_block1d8_v8_avg_sse2):
    428     push        rbp
    429     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    430     SHADOW_ARGS_TO_STACK 6
    431     SAVE_XMM 7
    432     push        rsi
    433     push        rdi
    434     push        rbx
    435     ; end prolog
    436 
    437     ALIGN_STACK 16, rax
    438     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    439     %define k0 [rsp + 16 * 0]
    440     %define k1 [rsp + 16 * 1]
    441     %define k2 [rsp + 16 * 2]
    442     %define k3 [rsp + 16 * 3]
    443     %define k4 [rsp + 16 * 4]
    444     %define k5 [rsp + 16 * 5]
    445     %define k6 [rsp + 16 * 6]
    446     %define k7 [rsp + 16 * 7]
    447     %define krd [rsp + 16 * 8]
    448     %define zero [rsp + 16 * 9]
    449 
    450     GET_FILTERS
    451 
    452     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    453     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    454     lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
    455     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    456 .loop:
        ;LOAD_VERT_8 also advances rsi by one source row
    457     LOAD_VERT_8 0
    458     APPLY_FILTER_8 1, 0
    459 
    460     lea         rdi, [rdi + rbx]            ;next output row
    461     dec         rcx
    462     jnz         .loop
    463 
    464     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    465     pop rsp
    466     pop rbx
    467     ; begin epilog
    468     pop rdi
    469     pop rsi
    470     RESTORE_XMM
    471     UNSHADOW_ARGS
    472     pop         rbp
    473     ret
    474 
    475 global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
        ;Vertical 8-tap filter, 16 pixels wide, averaging variant: the filtered
        ;bytes are averaged (pavgb) with the bytes already in the output row
        ;before being stored. Arguments are read in the same positions as in
        ;vpx_filter_block1d16_v8_sse2: arg(0) src, arg(1) source pitch,
        ;arg(2) output, arg(3) output pitch, arg(4) row count, arg(5) filter.
        ;Register roles in the loop: rsi = source row 0 and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = 3 * rax,
        ;rbx = output pitch, rcx = rows left.
        ;NOTE: the row counter is decremented before it is tested, so the row
        ;count must be non-zero.
    476 sym(vpx_filter_block1d16_v8_avg_sse2):
    477     push        rbp
    478     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    479     SHADOW_ARGS_TO_STACK 6
    480     SAVE_XMM 7
    481     push        rsi
    482     push        rdi
    483     push        rbx
    484     ; end prolog
    485 
    486     ALIGN_STACK 16, rax
    487     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    488     %define k0 [rsp + 16 * 0]
    489     %define k1 [rsp + 16 * 1]
    490     %define k2 [rsp + 16 * 2]
    491     %define k3 [rsp + 16 * 3]
    492     %define k4 [rsp + 16 * 4]
    493     %define k5 [rsp + 16 * 5]
    494     %define k6 [rsp + 16 * 6]
    495     %define k7 [rsp + 16 * 7]
    496     %define krd [rsp + 16 * 8]
    497     %define zero [rsp + 16 * 9]
    498 
    499     GET_FILTERS
    500 
    501     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    502     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    503     lea         rdx, [rax + rax * 2]        ;rdx = 3 * source pitch
    504     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    505 .loop:
    506     LOAD_VERT_8 0
    507     APPLY_FILTER_8 1, 0
    508     sub         rsi, rax                    ;undo LOAD_VERT_8's row advance
    509 
        ;right half: same rows at byte offset 8; this load advances rsi again
    510     LOAD_VERT_8 8
    511     APPLY_FILTER_8 1, 8
    512     add         rdi, rbx                    ;next output row
    513 
    514     dec         rcx
    515     jnz         .loop
    516 
    517     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    518     pop rsp
    519     pop rbx
    520     ; begin epilog
    521     pop rdi
    522     pop rsi
    523     RESTORE_XMM
    524     UNSHADOW_ARGS
    525     pop         rbp
    526     ret
    527 
    528 ;void vpx_filter_block1d4_h8_sse2
    529 ;(
    530 ;    unsigned char  *src_ptr,
    531 ;    unsigned int    src_pixels_per_line,
    532 ;    unsigned char  *output_ptr,
    533 ;    unsigned int    output_pitch,
    534 ;    unsigned int    output_height,
    535 ;    short *filter
    536 ;)
        ;Horizontal 8-tap filter, 4 pixels wide. Each loop iteration loads 16
        ;bytes starting three bytes before the current source position, builds
        ;the eight tap inputs by shifting that vector right by 0..7 bytes, and
        ;writes four bytes to the current output row.
        ;Register roles in the loop: rsi = source position, rdi = output row,
        ;rax = source pitch, rdx = output pitch, rcx = rows left.
        ;NOTE: both pitches are sign-extended from 32 bits (movsxd), and the row
        ;counter is decremented before it is tested, so output_height must be
        ;non-zero.
    537 global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
    538 sym(vpx_filter_block1d4_h8_sse2):
    539     push        rbp
    540     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    541     SHADOW_ARGS_TO_STACK 6
    542     SAVE_XMM 7
    543     push        rsi
    544     push        rdi
    545     ; end prolog
    546 
    547     ALIGN_STACK 16, rax
    548     sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    549     %define k0k1 [rsp + 16 * 0]
    550     %define k2k3 [rsp + 16 * 1]
    551     %define k5k4 [rsp + 16 * 2]
    552     %define k6k7 [rsp + 16 * 3]
    553     %define krd [rsp + 16 * 4]
    554     %define zero [rsp + 16 * 5]
    555 
    556     GET_FILTERS_4
    557 
    558     mov         rsi, arg(0)                 ;src_ptr
    559     mov         rdi, arg(2)                 ;output_ptr
    560 
    561     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    562     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    563     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    564 
    565 .loop:
    566     movdqu      xmm0,   [rsi - 3]           ;load src
    567 
        ;one copy of the loaded vector per tap
    568     movdqa      xmm1, xmm0
    569     movdqa      xmm6, xmm0
    570     movdqa      xmm7, xmm0
    571     movdqa      xmm2, xmm0
    572     movdqa      xmm3, xmm0
    573     movdqa      xmm5, xmm0
    574     movdqa      xmm4, xmm0
    575 
        ;shift copy n right by n bytes so its low bytes are the inputs of tap n
    576     psrldq      xmm1, 1
    577     psrldq      xmm6, 6
    578     psrldq      xmm7, 7
    579     psrldq      xmm2, 2
    580     psrldq      xmm3, 3
    581     psrldq      xmm5, 5
    582     psrldq      xmm4, 4
    583 
    584     APPLY_FILTER_4 0
    585 
    586     lea         rsi, [rsi + rax]            ;next source row
    587     lea         rdi, [rdi + rdx]            ;next output row
    588     dec         rcx
    589     jnz         .loop
    590 
    591     add rsp, 16 * 6                         ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    592     pop rsp
    593 
    594     ; begin epilog
    595     pop rdi
    596     pop rsi
    597     RESTORE_XMM
    598     UNSHADOW_ARGS
    599     pop         rbp
    600     ret
    601 
    602 ;void vpx_filter_block1d8_h8_sse2
    603 ;(
    604 ;    unsigned char  *src_ptr,
    605 ;    unsigned int    src_pixels_per_line,
    606 ;    unsigned char  *output_ptr,
    607 ;    unsigned int    output_pitch,
    608 ;    unsigned int    output_height,
    609 ;    short *filter
    610 ;)
        ;Horizontal 8-tap filter, 8 pixels wide. Each loop iteration loads 16
        ;bytes starting three bytes before the current source position, builds
        ;the eight tap inputs by shifting that vector right by 0..7 bytes, and
        ;writes eight bytes to the current output row.
        ;Register roles in the loop: rsi = source position and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = output pitch,
        ;rcx = rows left.
        ;NOTE: both pitches are sign-extended from 32 bits (movsxd), and the row
        ;counter is decremented before it is tested, so output_height must be
        ;non-zero.
    611 global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
    612 sym(vpx_filter_block1d8_h8_sse2):
    613     push        rbp
    614     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    615     SHADOW_ARGS_TO_STACK 6
    616     SAVE_XMM 7
    617     push        rsi
    618     push        rdi
    619     ; end prolog
    620 
    621     ALIGN_STACK 16, rax
    622     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    623     %define k0 [rsp + 16 * 0]
    624     %define k1 [rsp + 16 * 1]
    625     %define k2 [rsp + 16 * 2]
    626     %define k3 [rsp + 16 * 3]
    627     %define k4 [rsp + 16 * 4]
    628     %define k5 [rsp + 16 * 5]
    629     %define k6 [rsp + 16 * 6]
    630     %define k7 [rsp + 16 * 7]
    631     %define krd [rsp + 16 * 8]
    632     %define zero [rsp + 16 * 9]
    633 
    634     GET_FILTERS
    635 
    636     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    637     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    638     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    639 
    640 .loop:
    641     movdqu      xmm0,   [rsi - 3]           ;load src
    642 
        ;one copy of the loaded vector per tap
    643     movdqa      xmm1, xmm0
    644     movdqa      xmm6, xmm0
    645     movdqa      xmm7, xmm0
    646     movdqa      xmm2, xmm0
    647     movdqa      xmm5, xmm0
    648     movdqa      xmm3, xmm0
    649     movdqa      xmm4, xmm0
    650 
        ;shift copy n right by n bytes so its low bytes are the inputs of tap n
    651     psrldq      xmm1, 1
    652     psrldq      xmm6, 6
    653     psrldq      xmm7, 7
    654     psrldq      xmm2, 2
    655     psrldq      xmm5, 5
    656     psrldq      xmm3, 3
    657     psrldq      xmm4, 4
    658 
    659     APPLY_FILTER_8 0, 0
    660 
    661     lea         rsi, [rsi + rax]            ;next source row
    662     lea         rdi, [rdi + rdx]            ;next output row
    663     dec         rcx
    664     jnz         .loop
    665 
    666     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    667     pop rsp
    668 
    669     ; begin epilog
    670     pop rdi
    671     pop rsi
    672     RESTORE_XMM
    673     UNSHADOW_ARGS
    674     pop         rbp
    675     ret
    676 
    677 ;void vpx_filter_block1d16_h8_sse2
    678 ;(
    679 ;    unsigned char  *src_ptr,
    680 ;    unsigned int    src_pixels_per_line,
    681 ;    unsigned char  *output_ptr,
    682 ;    unsigned int    output_pitch,
    683 ;    unsigned int    output_height,
    684 ;    short *filter
    685 ;)
        ;Horizontal 8-tap filter, 16 pixels wide. Each loop iteration filters
        ;one output row in two halves of eight pixels: the first half from the
        ;16 bytes at [rsi - 3], the second from the 16 bytes at [rsi + 5]
        ;(eight bytes further on). For each half the eight tap inputs are the
        ;loaded vector shifted right by 0..7 bytes.
        ;Register roles in the loop: rsi = source position and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = output pitch,
        ;rcx = rows left.
        ;NOTE: both pitches are sign-extended from 32 bits (movsxd), and the row
        ;counter is decremented before it is tested, so output_height must be
        ;non-zero.
    686 global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
    687 sym(vpx_filter_block1d16_h8_sse2):
    688     push        rbp
    689     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    690     SHADOW_ARGS_TO_STACK 6
    691     SAVE_XMM 7
    692     push        rsi
    693     push        rdi
    694     ; end prolog
    695 
    696     ALIGN_STACK 16, rax
    697     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    698     %define k0 [rsp + 16 * 0]
    699     %define k1 [rsp + 16 * 1]
    700     %define k2 [rsp + 16 * 2]
    701     %define k3 [rsp + 16 * 3]
    702     %define k4 [rsp + 16 * 4]
    703     %define k5 [rsp + 16 * 5]
    704     %define k6 [rsp + 16 * 6]
    705     %define k7 [rsp + 16 * 7]
    706     %define krd [rsp + 16 * 8]
    707     %define zero [rsp + 16 * 9]
    708 
    709     GET_FILTERS
    710 
    711     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    712     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    713     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    714 
    715 .loop:
        ;first half: output bytes 0-7
    716     movdqu      xmm0,   [rsi - 3]           ;load src
    717 
        ;one copy of the loaded vector per tap
    718     movdqa      xmm1, xmm0
    719     movdqa      xmm6, xmm0
    720     movdqa      xmm7, xmm0
    721     movdqa      xmm2, xmm0
    722     movdqa      xmm5, xmm0
    723     movdqa      xmm3, xmm0
    724     movdqa      xmm4, xmm0
    725 
        ;shift copy n right by n bytes so its low bytes are the inputs of tap n
    726     psrldq      xmm1, 1
    727     psrldq      xmm6, 6
    728     psrldq      xmm7, 7
    729     psrldq      xmm2, 2
    730     psrldq      xmm5, 5
    731     psrldq      xmm3, 3
    732     psrldq      xmm4, 4
    733 
    734     APPLY_FILTER_8 0, 0
    735 
        ;second half: output bytes 8-15
    736     movdqu      xmm0,   [rsi + 5]           ;load src
    737 
    738     movdqa      xmm1, xmm0
    739     movdqa      xmm6, xmm0
    740     movdqa      xmm7, xmm0
    741     movdqa      xmm2, xmm0
    742     movdqa      xmm5, xmm0
    743     movdqa      xmm3, xmm0
    744     movdqa      xmm4, xmm0
    745 
    746     psrldq      xmm1, 1
    747     psrldq      xmm6, 6
    748     psrldq      xmm7, 7
    749     psrldq      xmm2, 2
    750     psrldq      xmm5, 5
    751     psrldq      xmm3, 3
    752     psrldq      xmm4, 4
    753 
    754     APPLY_FILTER_8 0, 8
    755 
    756     lea         rsi, [rsi + rax]            ;next source row
    757     lea         rdi, [rdi + rdx]            ;next output row
    758     dec         rcx
    759     jnz         .loop
    760 
    761     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    762     pop rsp
    763 
    764     ; begin epilog
    765     pop rdi
    766     pop rsi
    767     RESTORE_XMM
    768     UNSHADOW_ARGS
    769     pop         rbp
    770     ret
    771 
    772 global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
        ;Horizontal 8-tap filter, 4 pixels wide, averaging variant: the
        ;filtered bytes are averaged (pavgb) with the bytes already in the
        ;output row before being stored. Arguments are read in the same
        ;positions as in vpx_filter_block1d4_h8_sse2: arg(0) src, arg(1) source
        ;pitch, arg(2) output, arg(3) output pitch, arg(4) row count,
        ;arg(5) filter.
        ;Register roles in the loop: rsi = source position, rdi = output row,
        ;rax = source pitch, rdx = output pitch, rcx = rows left.
        ;NOTE: the row counter is decremented before it is tested, so the row
        ;count must be non-zero.
    773 sym(vpx_filter_block1d4_h8_avg_sse2):
    774     push        rbp
    775     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    776     SHADOW_ARGS_TO_STACK 6
    777     SAVE_XMM 7
    778     push        rsi
    779     push        rdi
    780     ; end prolog
    781 
    782     ALIGN_STACK 16, rax
    783     sub         rsp, 16 * 6                 ;six 16-byte scratch slots
    784     %define k0k1 [rsp + 16 * 0]
    785     %define k2k3 [rsp + 16 * 1]
    786     %define k5k4 [rsp + 16 * 2]
    787     %define k6k7 [rsp + 16 * 3]
    788     %define krd [rsp + 16 * 4]
    789     %define zero [rsp + 16 * 5]
    790 
    791     GET_FILTERS_4
    792 
    793     mov         rsi, arg(0)                 ;src_ptr
    794     mov         rdi, arg(2)                 ;output_ptr
    795 
    796     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    797     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    798     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    799 
    800 .loop:
    801     movdqu      xmm0,   [rsi - 3]           ;load src
    802 
        ;one copy of the loaded vector per tap
    803     movdqa      xmm1, xmm0
    804     movdqa      xmm6, xmm0
    805     movdqa      xmm7, xmm0
    806     movdqa      xmm2, xmm0
    807     movdqa      xmm3, xmm0
    808     movdqa      xmm5, xmm0
    809     movdqa      xmm4, xmm0
    810 
        ;shift copy n right by n bytes so its low bytes are the inputs of tap n
    811     psrldq      xmm1, 1
    812     psrldq      xmm6, 6
    813     psrldq      xmm7, 7
    814     psrldq      xmm2, 2
    815     psrldq      xmm3, 3
    816     psrldq      xmm5, 5
    817     psrldq      xmm4, 4
    818 
        ;argument 1 selects averaging with the existing output bytes
    819     APPLY_FILTER_4 1
    820 
    821     lea         rsi, [rsi + rax]            ;next source row
    822     lea         rdi, [rdi + rdx]            ;next output row
    823     dec         rcx
    824     jnz         .loop
    825 
    826     add rsp, 16 * 6                         ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    827     pop rsp
    828 
    829     ; begin epilog
    830     pop rdi
    831     pop rsi
    832     RESTORE_XMM
    833     UNSHADOW_ARGS
    834     pop         rbp
    835     ret
    836 
    837 global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
        ;Horizontal 8-tap filter, 8 pixels wide, averaging variant: the
        ;filtered bytes are averaged (pavgb) with the bytes already in the
        ;output row before being stored. Arguments are read in the same
        ;positions as in vpx_filter_block1d8_h8_sse2: arg(0) src, arg(1) source
        ;pitch, arg(2) output, arg(3) output pitch, arg(4) row count,
        ;arg(5) filter.
        ;Register roles in the loop: rsi = source position and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = output pitch,
        ;rcx = rows left.
        ;NOTE: the row counter is decremented before it is tested, so the row
        ;count must be non-zero.
    838 sym(vpx_filter_block1d8_h8_avg_sse2):
    839     push        rbp
    840     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    841     SHADOW_ARGS_TO_STACK 6
    842     SAVE_XMM 7
    843     push        rsi
    844     push        rdi
    845     ; end prolog
    846 
    847     ALIGN_STACK 16, rax
    848     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    849     %define k0 [rsp + 16 * 0]
    850     %define k1 [rsp + 16 * 1]
    851     %define k2 [rsp + 16 * 2]
    852     %define k3 [rsp + 16 * 3]
    853     %define k4 [rsp + 16 * 4]
    854     %define k5 [rsp + 16 * 5]
    855     %define k6 [rsp + 16 * 6]
    856     %define k7 [rsp + 16 * 7]
    857     %define krd [rsp + 16 * 8]
    858     %define zero [rsp + 16 * 9]
    859 
    860     GET_FILTERS
    861 
    862     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    863     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    864     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    865 
    866 .loop:
    867     movdqu      xmm0,   [rsi - 3]           ;load src
    868 
        ;one copy of the loaded vector per tap
    869     movdqa      xmm1, xmm0
    870     movdqa      xmm6, xmm0
    871     movdqa      xmm7, xmm0
    872     movdqa      xmm2, xmm0
    873     movdqa      xmm5, xmm0
    874     movdqa      xmm3, xmm0
    875     movdqa      xmm4, xmm0
    876 
        ;shift copy n right by n bytes so its low bytes are the inputs of tap n
    877     psrldq      xmm1, 1
    878     psrldq      xmm6, 6
    879     psrldq      xmm7, 7
    880     psrldq      xmm2, 2
    881     psrldq      xmm5, 5
    882     psrldq      xmm3, 3
    883     psrldq      xmm4, 4
    884 
        ;first argument 1 selects averaging with the existing output bytes
    885     APPLY_FILTER_8 1, 0
    886 
    887     lea         rsi, [rsi + rax]            ;next source row
    888     lea         rdi, [rdi + rdx]            ;next output row
    889     dec         rcx
    890     jnz         .loop
    891 
    892     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    893     pop rsp
    894 
    895     ; begin epilog
    896     pop rdi
    897     pop rsi
    898     RESTORE_XMM
    899     UNSHADOW_ARGS
    900     pop         rbp
    901     ret
    901     ret
    902 
    903 global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
        ;Horizontal 8-tap filter, 16 pixels wide, averaging variant: each half
        ;of eight filtered bytes is averaged (pavgb) with the bytes already in
        ;the output row before being stored. Arguments are read in the same
        ;positions as in vpx_filter_block1d16_h8_sse2: arg(0) src, arg(1) source
        ;pitch, arg(2) output, arg(3) output pitch, arg(4) row count,
        ;arg(5) filter.
        ;Register roles in the loop: rsi = source position and rdi = output row
        ;(both loaded by GET_FILTERS), rax = source pitch, rdx = output pitch,
        ;rcx = rows left.
        ;NOTE: the row counter is decremented before it is tested, so the row
        ;count must be non-zero.
    904 sym(vpx_filter_block1d16_h8_avg_sse2):
    905     push        rbp
    906     mov         rbp, rsp
        ;SHADOW_ARGS_TO_STACK / SAVE_XMM / ALIGN_STACK are helper macros from
        ;x86_abi_support.asm (not visible in this file).
    907     SHADOW_ARGS_TO_STACK 6
    908     SAVE_XMM 7
    909     push        rsi
    910     push        rdi
    911     ; end prolog
    912 
    913     ALIGN_STACK 16, rax
    914     sub         rsp, 16 * 10                ;ten 16-byte scratch slots
    915     %define k0 [rsp + 16 * 0]
    916     %define k1 [rsp + 16 * 1]
    917     %define k2 [rsp + 16 * 2]
    918     %define k3 [rsp + 16 * 3]
    919     %define k4 [rsp + 16 * 4]
    920     %define k5 [rsp + 16 * 5]
    921     %define k6 [rsp + 16 * 6]
    922     %define k7 [rsp + 16 * 7]
    923     %define krd [rsp + 16 * 8]
    924     %define zero [rsp + 16 * 9]
    925 
    926     GET_FILTERS
    927 
    928     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    929     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    930     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    931 
    932 .loop:
        ;first half: output bytes 0-7
    933     movdqu      xmm0,   [rsi - 3]           ;load src
    934 
        ;one copy of the loaded vector per tap
    935     movdqa      xmm1, xmm0
    936     movdqa      xmm6, xmm0
    937     movdqa      xmm7, xmm0
    938     movdqa      xmm2, xmm0
    939     movdqa      xmm5, xmm0
    940     movdqa      xmm3, xmm0
    941     movdqa      xmm4, xmm0
    942 
        ;shift copy n right by n bytes so its low bytes are the inputs of tap n
    943     psrldq      xmm1, 1
    944     psrldq      xmm6, 6
    945     psrldq      xmm7, 7
    946     psrldq      xmm2, 2
    947     psrldq      xmm5, 5
    948     psrldq      xmm3, 3
    949     psrldq      xmm4, 4
    950 
    951     APPLY_FILTER_8 1, 0
    952 
        ;second half: output bytes 8-15, source eight bytes further on
    953     movdqu      xmm0,   [rsi + 5]           ;load src
    954 
    955     movdqa      xmm1, xmm0
    956     movdqa      xmm6, xmm0
    957     movdqa      xmm7, xmm0
    958     movdqa      xmm2, xmm0
    959     movdqa      xmm5, xmm0
    960     movdqa      xmm3, xmm0
    961     movdqa      xmm4, xmm0
    962 
    963     psrldq      xmm1, 1
    964     psrldq      xmm6, 6
    965     psrldq      xmm7, 7
    966     psrldq      xmm2, 2
    967     psrldq      xmm5, 5
    968     psrldq      xmm3, 3
    969     psrldq      xmm4, 4
    970 
    971     APPLY_FILTER_8 1, 8
    972 
    973     lea         rsi, [rsi + rax]            ;next source row
    974     lea         rdi, [rdi + rdx]            ;next output row
    975     dec         rcx
    976     jnz         .loop
    977 
    978     add rsp, 16 * 10                        ;release the scratch slots
        ;NOTE(review): presumably pops the stack pointer saved by ALIGN_STACK;
        ;confirm against x86_abi_support.asm.
    979     pop rsp
    980 
    981     ; begin epilog
    982     pop rdi
    983     pop rsi
    984     RESTORE_XMM
    985     UNSHADOW_ARGS
    986     pop         rbp
    987     ret
    988