;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ;Note: tap3 and tap4 have to be applied and added after other taps to avoid
     15 ;overflow.
     16 
     17 %macro HIGH_GET_FILTERS_4 0
     18     mov         rdx, arg(5)                 ;filter ptr
     19     mov         rcx, 0x00000040
     20 
     21     movdqa      xmm7, [rdx]                 ;load filters
     22     pshuflw     xmm0, xmm7, 0b              ;k0
     23     pshuflw     xmm1, xmm7, 01010101b       ;k1
     24     pshuflw     xmm2, xmm7, 10101010b       ;k2
     25     pshuflw     xmm3, xmm7, 11111111b       ;k3
     26     psrldq      xmm7, 8
     27     pshuflw     xmm4, xmm7, 0b              ;k4
     28     pshuflw     xmm5, xmm7, 01010101b       ;k5
     29     pshuflw     xmm6, xmm7, 10101010b       ;k6
     30     pshuflw     xmm7, xmm7, 11111111b       ;k7
     31 
     32     punpcklwd   xmm0, xmm6
     33     punpcklwd   xmm2, xmm5
     34     punpcklwd   xmm3, xmm4
     35     punpcklwd   xmm1, xmm7
     36 
     37     movdqa      k0k6, xmm0
     38     movdqa      k2k5, xmm2
     39     movdqa      k3k4, xmm3
     40     movdqa      k1k7, xmm1
     41 
     42     movq        xmm6, rcx
     43     pshufd      xmm6, xmm6, 0
     44     movdqa      krd, xmm6
     45 
     46     ;Compute max and min values of a pixel
     47     mov         rdx, 0x00010001
     48     movsxd      rcx, DWORD PTR arg(6)      ;bps
     49     movq        xmm0, rdx
     50     movq        xmm1, rcx
     51     pshufd      xmm0, xmm0, 0b
     52     movdqa      xmm2, xmm0
     53     psllw       xmm0, xmm1
     54     psubw       xmm0, xmm2
     55     pxor        xmm1, xmm1
     56     movdqa      max, xmm0                  ;max value (for clamping)
     57     movdqa      min, xmm1                  ;min value (for clamping)
     58 
     59 %endm
     60 
     61 %macro HIGH_APPLY_FILTER_4 1
     62     punpcklwd   xmm0, xmm6                  ;two row in one register
     63     punpcklwd   xmm1, xmm7
     64     punpcklwd   xmm2, xmm5
     65     punpcklwd   xmm3, xmm4
     66 
     67     pmaddwd     xmm0, k0k6                  ;multiply the filter factors
     68     pmaddwd     xmm1, k1k7
     69     pmaddwd     xmm2, k2k5
     70     pmaddwd     xmm3, k3k4
     71 
     72     paddd       xmm0, xmm1                  ;sum
     73     paddd       xmm0, xmm2
     74     paddd       xmm0, xmm3
     75 
     76     paddd       xmm0, krd                   ;rounding
     77     psrad       xmm0, 7                     ;shift
     78     packssdw    xmm0, xmm0                  ;pack to word
     79 
     80     ;clamp the values
     81     pminsw      xmm0, max
     82     pmaxsw      xmm0, min
     83 
     84 %if %1
     85     movq        xmm1, [rdi]
     86     pavgw       xmm0, xmm1
     87 %endif
     88     movq        [rdi], xmm0
     89 %endm
     90 
     91 %macro HIGH_GET_FILTERS 0
     92     mov         rdx, arg(5)                 ;filter ptr
     93     mov         rsi, arg(0)                 ;src_ptr
     94     mov         rdi, arg(2)                 ;output_ptr
     95     mov         rcx, 0x00000040
     96 
     97     movdqa      xmm7, [rdx]                 ;load filters
     98     pshuflw     xmm0, xmm7, 0b              ;k0
     99     pshuflw     xmm1, xmm7, 01010101b       ;k1
    100     pshuflw     xmm2, xmm7, 10101010b       ;k2
    101     pshuflw     xmm3, xmm7, 11111111b       ;k3
    102     pshufhw     xmm4, xmm7, 0b              ;k4
    103     pshufhw     xmm5, xmm7, 01010101b       ;k5
    104     pshufhw     xmm6, xmm7, 10101010b       ;k6
    105     pshufhw     xmm7, xmm7, 11111111b       ;k7
    106     punpcklqdq  xmm2, xmm2
    107     punpcklqdq  xmm3, xmm3
    108     punpcklwd   xmm0, xmm1
    109     punpckhwd   xmm6, xmm7
    110     punpckhwd   xmm2, xmm5
    111     punpckhwd   xmm3, xmm4
    112 
    113     movdqa      k0k1, xmm0                  ;store filter factors on stack
    114     movdqa      k6k7, xmm6
    115     movdqa      k2k5, xmm2
    116     movdqa      k3k4, xmm3
    117 
    118     movq        xmm6, rcx
    119     pshufd      xmm6, xmm6, 0
    120     movdqa      krd, xmm6                   ;rounding
    121 
    122     ;Compute max and min values of a pixel
    123     mov         rdx, 0x00010001
    124     movsxd      rcx, DWORD PTR arg(6)       ;bps
    125     movq        xmm0, rdx
    126     movq        xmm1, rcx
    127     pshufd      xmm0, xmm0, 0b
    128     movdqa      xmm2, xmm0
    129     psllw       xmm0, xmm1
    130     psubw       xmm0, xmm2
    131     pxor        xmm1, xmm1
    132     movdqa      max, xmm0                  ;max value (for clamping)
    133     movdqa      min, xmm1                  ;min value (for clamping)
    134 %endm
    135 
    136 %macro LOAD_VERT_8 1
    137     movdqu      xmm0, [rsi + %1]            ;0
    138     movdqu      xmm1, [rsi + rax + %1]      ;1
    139     movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
    140     lea         rsi,  [rsi + rax]
    141     movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
    142     movdqu      xmm2, [rsi + rax + %1]      ;2
    143     movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
    144     movdqu      xmm4, [rsi + rdx + %1]      ;4
    145     movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
    146 %endm
    147 
    148 %macro HIGH_APPLY_FILTER_8 2
    149     movdqu      temp, xmm4
    150     movdqa      xmm4, xmm0
    151     punpcklwd   xmm0, xmm1
    152     punpckhwd   xmm4, xmm1
    153     movdqa      xmm1, xmm6
    154     punpcklwd   xmm6, xmm7
    155     punpckhwd   xmm1, xmm7
    156     movdqa      xmm7, xmm2
    157     punpcklwd   xmm2, xmm5
    158     punpckhwd   xmm7, xmm5
    159 
    160     movdqu      xmm5, temp
    161     movdqu      temp, xmm4
    162     movdqa      xmm4, xmm3
    163     punpcklwd   xmm3, xmm5
    164     punpckhwd   xmm4, xmm5
    165     movdqu      xmm5, temp
    166 
    167     pmaddwd     xmm0, k0k1
    168     pmaddwd     xmm5, k0k1
    169     pmaddwd     xmm6, k6k7
    170     pmaddwd     xmm1, k6k7
    171     pmaddwd     xmm2, k2k5
    172     pmaddwd     xmm7, k2k5
    173     pmaddwd     xmm3, k3k4
    174     pmaddwd     xmm4, k3k4
    175 
    176     paddd       xmm0, xmm6
    177     paddd       xmm0, xmm2
    178     paddd       xmm0, xmm3
    179     paddd       xmm5, xmm1
    180     paddd       xmm5, xmm7
    181     paddd       xmm5, xmm4
    182 
    183     paddd       xmm0, krd                   ;rounding
    184     paddd       xmm5, krd
    185     psrad       xmm0, 7                     ;shift
    186     psrad       xmm5, 7
    187     packssdw    xmm0, xmm5                  ;pack back to word
    188 
    189     ;clamp the values
    190     pminsw      xmm0, max
    191     pmaxsw      xmm0, min
    192 
    193 %if %1
    194     movdqu      xmm1, [rdi + %2]
    195     pavgw       xmm0, xmm1
    196 %endif
    197     movdqu      [rdi + %2], xmm0
    198 %endm
    199 
    200 ;void vpx_filter_block1d4_v8_sse2
    201 ;(
    202 ;    unsigned char *src_ptr,
    203 ;    unsigned int   src_pitch,
    204 ;    unsigned char *output_ptr,
    205 ;    unsigned int   out_pitch,
    206 ;    unsigned int   output_height,
    207 ;    short *filter
    208 ;)
    209 global sym(vpx_highbd_filter_block1d4_v8_sse2) PRIVATE
    210 sym(vpx_highbd_filter_block1d4_v8_sse2):
    211     push        rbp
    212     mov         rbp, rsp
    213     SHADOW_ARGS_TO_STACK 7
    214     SAVE_XMM 7
    215     push        rsi
    216     push        rdi
    217     push        rbx
    218     ; end prolog
    219 
    220     ALIGN_STACK 16, rax
    221     sub         rsp, 16 * 7
    222     %define k0k6 [rsp + 16 * 0]
    223     %define k2k5 [rsp + 16 * 1]
    224     %define k3k4 [rsp + 16 * 2]
    225     %define k1k7 [rsp + 16 * 3]
    226     %define krd [rsp + 16 * 4]
    227     %define max [rsp + 16 * 5]
    228     %define min [rsp + 16 * 6]
    229 
    230     HIGH_GET_FILTERS_4
    231 
    232     mov         rsi, arg(0)                 ;src_ptr
    233     mov         rdi, arg(2)                 ;output_ptr
    234 
    235     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    236     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    237     lea         rax, [rax + rax]            ;bytes per line
    238     lea         rbx, [rbx + rbx]
    239     lea         rdx, [rax + rax * 2]
    240     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    241 
    242 .loop:
    243     movq        xmm0, [rsi]                 ;load src: row 0
    244     movq        xmm1, [rsi + rax]           ;1
    245     movq        xmm6, [rsi + rdx * 2]       ;6
    246     lea         rsi,  [rsi + rax]
    247     movq        xmm7, [rsi + rdx * 2]       ;7
    248     movq        xmm2, [rsi + rax]           ;2
    249     movq        xmm3, [rsi + rax * 2]       ;3
    250     movq        xmm4, [rsi + rdx]           ;4
    251     movq        xmm5, [rsi + rax * 4]       ;5
    252 
    253     HIGH_APPLY_FILTER_4 0
    254 
    255     lea         rdi, [rdi + rbx]
    256     dec         rcx
    257     jnz         .loop
    258 
    259     add rsp, 16 * 7
    260     pop rsp
    261     pop rbx
    262     ; begin epilog
    263     pop rdi
    264     pop rsi
    265     RESTORE_XMM
    266     UNSHADOW_ARGS
    267     pop         rbp
    268     ret
    269 
    270 ;void vpx_filter_block1d8_v8_sse2
    271 ;(
    272 ;    unsigned char *src_ptr,
    273 ;    unsigned int   src_pitch,
    274 ;    unsigned char *output_ptr,
    275 ;    unsigned int   out_pitch,
    276 ;    unsigned int   output_height,
    277 ;    short *filter
    278 ;)
    279 global sym(vpx_highbd_filter_block1d8_v8_sse2) PRIVATE
    280 sym(vpx_highbd_filter_block1d8_v8_sse2):
    281     push        rbp
    282     mov         rbp, rsp
    283     SHADOW_ARGS_TO_STACK 7
    284     SAVE_XMM 7
    285     push        rsi
    286     push        rdi
    287     push        rbx
    288     ; end prolog
    289 
    290     ALIGN_STACK 16, rax
    291     sub         rsp, 16 * 8
    292     %define k0k1 [rsp + 16 * 0]
    293     %define k6k7 [rsp + 16 * 1]
    294     %define k2k5 [rsp + 16 * 2]
    295     %define k3k4 [rsp + 16 * 3]
    296     %define krd [rsp + 16 * 4]
    297     %define temp [rsp + 16 * 5]
    298     %define max [rsp + 16 * 6]
    299     %define min [rsp + 16 * 7]
    300 
    301     HIGH_GET_FILTERS
    302 
    303     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    304     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    305     lea         rax, [rax + rax]            ;bytes per line
    306     lea         rbx, [rbx + rbx]
    307     lea         rdx, [rax + rax * 2]
    308     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    309 
    310 .loop:
    311     LOAD_VERT_8 0
    312     HIGH_APPLY_FILTER_8 0, 0
    313 
    314     lea         rdi, [rdi + rbx]
    315     dec         rcx
    316     jnz         .loop
    317 
    318     add rsp, 16 * 8
    319     pop rsp
    320     pop rbx
    321     ; begin epilog
    322     pop rdi
    323     pop rsi
    324     RESTORE_XMM
    325     UNSHADOW_ARGS
    326     pop         rbp
    327     ret
    328 
    329 ;void vpx_filter_block1d16_v8_sse2
    330 ;(
    331 ;    unsigned char *src_ptr,
    332 ;    unsigned int   src_pitch,
    333 ;    unsigned char *output_ptr,
    334 ;    unsigned int   out_pitch,
    335 ;    unsigned int   output_height,
    336 ;    short *filter
    337 ;)
    338 global sym(vpx_highbd_filter_block1d16_v8_sse2) PRIVATE
    339 sym(vpx_highbd_filter_block1d16_v8_sse2):
    340     push        rbp
    341     mov         rbp, rsp
    342     SHADOW_ARGS_TO_STACK 7
    343     SAVE_XMM 7
    344     push        rsi
    345     push        rdi
    346     push        rbx
    347     ; end prolog
    348 
    349     ALIGN_STACK 16, rax
    350     sub         rsp, 16 * 8
    351     %define k0k1 [rsp + 16 * 0]
    352     %define k6k7 [rsp + 16 * 1]
    353     %define k2k5 [rsp + 16 * 2]
    354     %define k3k4 [rsp + 16 * 3]
    355     %define krd [rsp + 16 * 4]
    356     %define temp [rsp + 16 * 5]
    357     %define max [rsp + 16 * 6]
    358     %define min [rsp + 16 * 7]
    359 
    360     HIGH_GET_FILTERS
    361 
    362     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    363     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    364     lea         rax, [rax + rax]            ;bytes per line
    365     lea         rbx, [rbx + rbx]
    366     lea         rdx, [rax + rax * 2]
    367     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    368 
    369 .loop:
    370     LOAD_VERT_8 0
    371     HIGH_APPLY_FILTER_8 0, 0
    372     sub         rsi, rax
    373 
    374     LOAD_VERT_8 16
    375     HIGH_APPLY_FILTER_8 0, 16
    376     add         rdi, rbx
    377 
    378     dec         rcx
    379     jnz         .loop
    380 
    381     add rsp, 16 * 8
    382     pop rsp
    383     pop rbx
    384     ; begin epilog
    385     pop rdi
    386     pop rsi
    387     RESTORE_XMM
    388     UNSHADOW_ARGS
    389     pop         rbp
    390     ret
    391 
    392 global sym(vpx_highbd_filter_block1d4_v8_avg_sse2) PRIVATE
    393 sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
    394     push        rbp
    395     mov         rbp, rsp
    396     SHADOW_ARGS_TO_STACK 7
    397     SAVE_XMM 7
    398     push        rsi
    399     push        rdi
    400     push        rbx
    401     ; end prolog
    402 
    403     ALIGN_STACK 16, rax
    404     sub         rsp, 16 * 7
    405     %define k0k6 [rsp + 16 * 0]
    406     %define k2k5 [rsp + 16 * 1]
    407     %define k3k4 [rsp + 16 * 2]
    408     %define k1k7 [rsp + 16 * 3]
    409     %define krd [rsp + 16 * 4]
    410     %define max [rsp + 16 * 5]
    411     %define min [rsp + 16 * 6]
    412 
    413     HIGH_GET_FILTERS_4
    414 
    415     mov         rsi, arg(0)                 ;src_ptr
    416     mov         rdi, arg(2)                 ;output_ptr
    417 
    418     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    419     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    420     lea         rax, [rax + rax]            ;bytes per line
    421     lea         rbx, [rbx + rbx]
    422     lea         rdx, [rax + rax * 2]
    423     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    424 
    425 .loop:
    426     movq        xmm0, [rsi]                 ;load src: row 0
    427     movq        xmm1, [rsi + rax]           ;1
    428     movq        xmm6, [rsi + rdx * 2]       ;6
    429     lea         rsi,  [rsi + rax]
    430     movq        xmm7, [rsi + rdx * 2]       ;7
    431     movq        xmm2, [rsi + rax]           ;2
    432     movq        xmm3, [rsi + rax * 2]       ;3
    433     movq        xmm4, [rsi + rdx]           ;4
    434     movq        xmm5, [rsi + rax * 4]       ;5
    435 
    436     HIGH_APPLY_FILTER_4 1
    437 
    438     lea         rdi, [rdi + rbx]
    439     dec         rcx
    440     jnz         .loop
    441 
    442     add rsp, 16 * 7
    443     pop rsp
    444     pop rbx
    445     ; begin epilog
    446     pop rdi
    447     pop rsi
    448     RESTORE_XMM
    449     UNSHADOW_ARGS
    450     pop         rbp
    451     ret
    452 
    453 global sym(vpx_highbd_filter_block1d8_v8_avg_sse2) PRIVATE
    454 sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
    455     push        rbp
    456     mov         rbp, rsp
    457     SHADOW_ARGS_TO_STACK 7
    458     SAVE_XMM 7
    459     push        rsi
    460     push        rdi
    461     push        rbx
    462     ; end prolog
    463 
    464     ALIGN_STACK 16, rax
    465     sub         rsp, 16 * 8
    466     %define k0k1 [rsp + 16 * 0]
    467     %define k6k7 [rsp + 16 * 1]
    468     %define k2k5 [rsp + 16 * 2]
    469     %define k3k4 [rsp + 16 * 3]
    470     %define krd [rsp + 16 * 4]
    471     %define temp [rsp + 16 * 5]
    472     %define max [rsp + 16 * 6]
    473     %define min [rsp + 16 * 7]
    474 
    475     HIGH_GET_FILTERS
    476 
    477     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    478     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    479     lea         rax, [rax + rax]            ;bytes per line
    480     lea         rbx, [rbx + rbx]
    481     lea         rdx, [rax + rax * 2]
    482     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    483 .loop:
    484     LOAD_VERT_8 0
    485     HIGH_APPLY_FILTER_8 1, 0
    486 
    487     lea         rdi, [rdi + rbx]
    488     dec         rcx
    489     jnz         .loop
    490 
    491     add rsp, 16 * 8
    492     pop rsp
    493     pop rbx
    494     ; begin epilog
    495     pop rdi
    496     pop rsi
    497     RESTORE_XMM
    498     UNSHADOW_ARGS
    499     pop         rbp
    500     ret
    501 
    502 global sym(vpx_highbd_filter_block1d16_v8_avg_sse2) PRIVATE
    503 sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
    504     push        rbp
    505     mov         rbp, rsp
    506     SHADOW_ARGS_TO_STACK 7
    507     SAVE_XMM 7
    508     push        rsi
    509     push        rdi
    510     push        rbx
    511     ; end prolog
    512 
    513     ALIGN_STACK 16, rax
    514     sub         rsp, 16 * 8
    515     %define k0k1 [rsp + 16 * 0]
    516     %define k6k7 [rsp + 16 * 1]
    517     %define k2k5 [rsp + 16 * 2]
    518     %define k3k4 [rsp + 16 * 3]
    519     %define krd [rsp + 16 * 4]
    520     %define temp [rsp + 16 * 5]
    521     %define max [rsp + 16 * 6]
    522     %define min [rsp + 16 * 7]
    523 
    524     HIGH_GET_FILTERS
    525 
    526     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    527     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    528     lea         rax, [rax + rax]            ;bytes per line
    529     lea         rbx, [rbx + rbx]
    530     lea         rdx, [rax + rax * 2]
    531     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    532 .loop:
    533     LOAD_VERT_8 0
    534     HIGH_APPLY_FILTER_8 1, 0
    535     sub         rsi, rax
    536 
    537     LOAD_VERT_8 16
    538     HIGH_APPLY_FILTER_8 1, 16
    539     add         rdi, rbx
    540 
    541     dec         rcx
    542     jnz         .loop
    543 
    544     add rsp, 16 * 8
    545     pop rsp
    546     pop rbx
    547     ; begin epilog
    548     pop rdi
    549     pop rsi
    550     RESTORE_XMM
    551     UNSHADOW_ARGS
    552     pop         rbp
    553     ret
    554 
    555 ;void vpx_filter_block1d4_h8_sse2
    556 ;(
    557 ;    unsigned char  *src_ptr,
    558 ;    unsigned int    src_pixels_per_line,
    559 ;    unsigned char  *output_ptr,
    560 ;    unsigned int    output_pitch,
    561 ;    unsigned int    output_height,
    562 ;    short *filter
    563 ;)
    564 global sym(vpx_highbd_filter_block1d4_h8_sse2) PRIVATE
    565 sym(vpx_highbd_filter_block1d4_h8_sse2):
    566     push        rbp
    567     mov         rbp, rsp
    568     SHADOW_ARGS_TO_STACK 7
    569     SAVE_XMM 7
    570     push        rsi
    571     push        rdi
    572     ; end prolog
    573 
    574     ALIGN_STACK 16, rax
    575     sub         rsp, 16 * 7
    576     %define k0k6 [rsp + 16 * 0]
    577     %define k2k5 [rsp + 16 * 1]
    578     %define k3k4 [rsp + 16 * 2]
    579     %define k1k7 [rsp + 16 * 3]
    580     %define krd [rsp + 16 * 4]
    581     %define max [rsp + 16 * 5]
    582     %define min [rsp + 16 * 6]
    583 
    584     HIGH_GET_FILTERS_4
    585 
    586     mov         rsi, arg(0)                 ;src_ptr
    587     mov         rdi, arg(2)                 ;output_ptr
    588 
    589     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    590     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    591     lea         rax, [rax + rax]            ;bytes per line
    592     lea         rdx, [rdx + rdx]
    593     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    594 
    595 .loop:
    596     movdqu      xmm0,   [rsi - 6]           ;load src
    597     movdqu      xmm4,   [rsi + 2]
    598     movdqa      xmm1, xmm0
    599     movdqa      xmm6, xmm4
    600     movdqa      xmm7, xmm4
    601     movdqa      xmm2, xmm0
    602     movdqa      xmm3, xmm0
    603     movdqa      xmm5, xmm4
    604 
    605     psrldq      xmm1, 2
    606     psrldq      xmm6, 4
    607     psrldq      xmm7, 6
    608     psrldq      xmm2, 4
    609     psrldq      xmm3, 6
    610     psrldq      xmm5, 2
    611 
    612     HIGH_APPLY_FILTER_4 0
    613 
    614     lea         rsi, [rsi + rax]
    615     lea         rdi, [rdi + rdx]
    616     dec         rcx
    617     jnz         .loop
    618 
    619     add rsp, 16 * 7
    620     pop rsp
    621 
    622     ; begin epilog
    623     pop rdi
    624     pop rsi
    625     RESTORE_XMM
    626     UNSHADOW_ARGS
    627     pop         rbp
    628     ret
    629 
    630 ;void vpx_filter_block1d8_h8_sse2
    631 ;(
    632 ;    unsigned char  *src_ptr,
    633 ;    unsigned int    src_pixels_per_line,
    634 ;    unsigned char  *output_ptr,
    635 ;    unsigned int    output_pitch,
    636 ;    unsigned int    output_height,
    637 ;    short *filter
    638 ;)
    639 global sym(vpx_highbd_filter_block1d8_h8_sse2) PRIVATE
    640 sym(vpx_highbd_filter_block1d8_h8_sse2):
    641     push        rbp
    642     mov         rbp, rsp
    643     SHADOW_ARGS_TO_STACK 7
    644     SAVE_XMM 7
    645     push        rsi
    646     push        rdi
    647     ; end prolog
    648 
    649     ALIGN_STACK 16, rax
    650     sub         rsp, 16 * 8
    651     %define k0k1 [rsp + 16 * 0]
    652     %define k6k7 [rsp + 16 * 1]
    653     %define k2k5 [rsp + 16 * 2]
    654     %define k3k4 [rsp + 16 * 3]
    655     %define krd [rsp + 16 * 4]
    656     %define temp [rsp + 16 * 5]
    657     %define max [rsp + 16 * 6]
    658     %define min [rsp + 16 * 7]
    659 
    660     HIGH_GET_FILTERS
    661 
    662     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    663     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    664     lea         rax, [rax + rax]            ;bytes per line
    665     lea         rdx, [rdx + rdx]
    666     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    667 
    668 .loop:
    669     movdqu      xmm0,   [rsi - 6]           ;load src
    670     movdqu      xmm1,   [rsi - 4]
    671     movdqu      xmm2,   [rsi - 2]
    672     movdqu      xmm3,   [rsi]
    673     movdqu      xmm4,   [rsi + 2]
    674     movdqu      xmm5,   [rsi + 4]
    675     movdqu      xmm6,   [rsi + 6]
    676     movdqu      xmm7,   [rsi + 8]
    677 
    678     HIGH_APPLY_FILTER_8 0, 0
    679 
    680     lea         rsi, [rsi + rax]
    681     lea         rdi, [rdi + rdx]
    682     dec         rcx
    683     jnz         .loop
    684 
    685     add rsp, 16 * 8
    686     pop rsp
    687 
    688     ; begin epilog
    689     pop rdi
    690     pop rsi
    691     RESTORE_XMM
    692     UNSHADOW_ARGS
    693     pop         rbp
    694     ret
    695 
    696 ;void vpx_filter_block1d16_h8_sse2
    697 ;(
    698 ;    unsigned char  *src_ptr,
    699 ;    unsigned int    src_pixels_per_line,
    700 ;    unsigned char  *output_ptr,
    701 ;    unsigned int    output_pitch,
    702 ;    unsigned int    output_height,
    703 ;    short *filter
    704 ;)
    705 global sym(vpx_highbd_filter_block1d16_h8_sse2) PRIVATE
    706 sym(vpx_highbd_filter_block1d16_h8_sse2):
    707     push        rbp
    708     mov         rbp, rsp
    709     SHADOW_ARGS_TO_STACK 7
    710     SAVE_XMM 7
    711     push        rsi
    712     push        rdi
    713     ; end prolog
    714 
    715     ALIGN_STACK 16, rax
    716     sub         rsp, 16 * 8
    717     %define k0k1 [rsp + 16 * 0]
    718     %define k6k7 [rsp + 16 * 1]
    719     %define k2k5 [rsp + 16 * 2]
    720     %define k3k4 [rsp + 16 * 3]
    721     %define krd [rsp + 16 * 4]
    722     %define temp [rsp + 16 * 5]
    723     %define max [rsp + 16 * 6]
    724     %define min [rsp + 16 * 7]
    725 
    726     HIGH_GET_FILTERS
    727 
    728     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    729     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    730     lea         rax, [rax + rax]            ;bytes per line
    731     lea         rdx, [rdx + rdx]
    732     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    733 
    734 .loop:
    735     movdqu      xmm0,   [rsi - 6]           ;load src
    736     movdqu      xmm1,   [rsi - 4]
    737     movdqu      xmm2,   [rsi - 2]
    738     movdqu      xmm3,   [rsi]
    739     movdqu      xmm4,   [rsi + 2]
    740     movdqu      xmm5,   [rsi + 4]
    741     movdqu      xmm6,   [rsi + 6]
    742     movdqu      xmm7,   [rsi + 8]
    743 
    744     HIGH_APPLY_FILTER_8 0, 0
    745 
    746     movdqu      xmm0,   [rsi + 10]           ;load src
    747     movdqu      xmm1,   [rsi + 12]
    748     movdqu      xmm2,   [rsi + 14]
    749     movdqu      xmm3,   [rsi + 16]
    750     movdqu      xmm4,   [rsi + 18]
    751     movdqu      xmm5,   [rsi + 20]
    752     movdqu      xmm6,   [rsi + 22]
    753     movdqu      xmm7,   [rsi + 24]
    754 
    755     HIGH_APPLY_FILTER_8 0, 16
    756 
    757     lea         rsi, [rsi + rax]
    758     lea         rdi, [rdi + rdx]
    759     dec         rcx
    760     jnz         .loop
    761 
    762     add rsp, 16 * 8
    763     pop rsp
    764 
    765     ; begin epilog
    766     pop rdi
    767     pop rsi
    768     RESTORE_XMM
    769     UNSHADOW_ARGS
    770     pop         rbp
    771     ret
    772 
    773 global sym(vpx_highbd_filter_block1d4_h8_avg_sse2) PRIVATE
    774 sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
    775     push        rbp
    776     mov         rbp, rsp
    777     SHADOW_ARGS_TO_STACK 7
    778     SAVE_XMM 7
    779     push        rsi
    780     push        rdi
    781     ; end prolog
    782 
    783     ALIGN_STACK 16, rax
    784     sub         rsp, 16 * 7
    785     %define k0k6 [rsp + 16 * 0]
    786     %define k2k5 [rsp + 16 * 1]
    787     %define k3k4 [rsp + 16 * 2]
    788     %define k1k7 [rsp + 16 * 3]
    789     %define krd [rsp + 16 * 4]
    790     %define max [rsp + 16 * 5]
    791     %define min [rsp + 16 * 6]
    792 
    793     HIGH_GET_FILTERS_4
    794 
    795     mov         rsi, arg(0)                 ;src_ptr
    796     mov         rdi, arg(2)                 ;output_ptr
    797 
    798     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    799     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    800     lea         rax, [rax + rax]            ;bytes per line
    801     lea         rdx, [rdx + rdx]
    802     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    803 
    804 .loop:
    805     movdqu      xmm0,   [rsi - 6]           ;load src
    806     movdqu      xmm4,   [rsi + 2]
    807     movdqa      xmm1, xmm0
    808     movdqa      xmm6, xmm4
    809     movdqa      xmm7, xmm4
    810     movdqa      xmm2, xmm0
    811     movdqa      xmm3, xmm0
    812     movdqa      xmm5, xmm4
    813 
    814     psrldq      xmm1, 2
    815     psrldq      xmm6, 4
    816     psrldq      xmm7, 6
    817     psrldq      xmm2, 4
    818     psrldq      xmm3, 6
    819     psrldq      xmm5, 2
    820 
    821     HIGH_APPLY_FILTER_4 1
    822 
    823     lea         rsi, [rsi + rax]
    824     lea         rdi, [rdi + rdx]
    825     dec         rcx
    826     jnz         .loop
    827 
    828     add rsp, 16 * 7
    829     pop rsp
    830 
    831     ; begin epilog
    832     pop rdi
    833     pop rsi
    834     RESTORE_XMM
    835     UNSHADOW_ARGS
    836     pop         rbp
    837     ret
    838 
    839 global sym(vpx_highbd_filter_block1d8_h8_avg_sse2) PRIVATE
    840 sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
    841     push        rbp
    842     mov         rbp, rsp
    843     SHADOW_ARGS_TO_STACK 7
    844     SAVE_XMM 7
    845     push        rsi
    846     push        rdi
    847     ; end prolog
    848 
    849     ALIGN_STACK 16, rax
    850     sub         rsp, 16 * 8
    851     %define k0k1 [rsp + 16 * 0]
    852     %define k6k7 [rsp + 16 * 1]
    853     %define k2k5 [rsp + 16 * 2]
    854     %define k3k4 [rsp + 16 * 3]
    855     %define krd [rsp + 16 * 4]
    856     %define temp [rsp + 16 * 5]
    857     %define max [rsp + 16 * 6]
    858     %define min [rsp + 16 * 7]
    859 
    860     HIGH_GET_FILTERS
    861 
    862     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    863     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    864     lea         rax, [rax + rax]            ;bytes per line
    865     lea         rdx, [rdx + rdx]
    866     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    867 
    868 .loop:
    869     movdqu      xmm0,   [rsi - 6]           ;load src
    870     movdqu      xmm1,   [rsi - 4]
    871     movdqu      xmm2,   [rsi - 2]
    872     movdqu      xmm3,   [rsi]
    873     movdqu      xmm4,   [rsi + 2]
    874     movdqu      xmm5,   [rsi + 4]
    875     movdqu      xmm6,   [rsi + 6]
    876     movdqu      xmm7,   [rsi + 8]
    877 
    878     HIGH_APPLY_FILTER_8 1, 0
    879 
    880     lea         rsi, [rsi + rax]
    881     lea         rdi, [rdi + rdx]
    882     dec         rcx
    883     jnz         .loop
    884 
    885     add rsp, 16 * 8
    886     pop rsp
    887 
    888     ; begin epilog
    889     pop rdi
    890     pop rsi
    891     RESTORE_XMM
    892     UNSHADOW_ARGS
    893     pop         rbp
    894     ret
    895 
    896 global sym(vpx_highbd_filter_block1d16_h8_avg_sse2) PRIVATE
    897 sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
    898     push        rbp
    899     mov         rbp, rsp
    900     SHADOW_ARGS_TO_STACK 7
    901     SAVE_XMM 7
    902     push        rsi
    903     push        rdi
    904     ; end prolog
    905 
    906     ALIGN_STACK 16, rax
    907     sub         rsp, 16 * 8
    908     %define k0k1 [rsp + 16 * 0]
    909     %define k6k7 [rsp + 16 * 1]
    910     %define k2k5 [rsp + 16 * 2]
    911     %define k3k4 [rsp + 16 * 3]
    912     %define krd [rsp + 16 * 4]
    913     %define temp [rsp + 16 * 5]
    914     %define max [rsp + 16 * 6]
    915     %define min [rsp + 16 * 7]
    916 
    917     HIGH_GET_FILTERS
    918 
    919     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    920     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    921     lea         rax, [rax + rax]            ;bytes per line
    922     lea         rdx, [rdx + rdx]
    923     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    924 
    925 .loop:
    926     movdqu      xmm0,   [rsi - 6]           ;load src
    927     movdqu      xmm1,   [rsi - 4]
    928     movdqu      xmm2,   [rsi - 2]
    929     movdqu      xmm3,   [rsi]
    930     movdqu      xmm4,   [rsi + 2]
    931     movdqu      xmm5,   [rsi + 4]
    932     movdqu      xmm6,   [rsi + 6]
    933     movdqu      xmm7,   [rsi + 8]
    934 
    935     HIGH_APPLY_FILTER_8 1, 0
    936 
    937     movdqu      xmm0,   [rsi + 10]           ;load src
    938     movdqu      xmm1,   [rsi + 12]
    939     movdqu      xmm2,   [rsi + 14]
    940     movdqu      xmm3,   [rsi + 16]
    941     movdqu      xmm4,   [rsi + 18]
    942     movdqu      xmm5,   [rsi + 20]
    943     movdqu      xmm6,   [rsi + 22]
    944     movdqu      xmm7,   [rsi + 24]
    945 
    946     HIGH_APPLY_FILTER_8 1, 16
    947 
    948     lea         rsi, [rsi + rax]
    949     lea         rdi, [rdi + rdx]
    950     dec         rcx
    951     jnz         .loop
    952 
    953     add rsp, 16 * 8
    954     pop rsp
    955 
    956     ; begin epilog
    957     pop rdi
    958     pop rsi
    959     RESTORE_XMM
    960     UNSHADOW_ARGS
    961     pop         rbp
    962     ret
    963