Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 %include "vpx_ports/x86_abi_support.asm"
     12 
     13 %macro GET_PARAM_4 0
        ; Loads the arguments and constants shared by the 4-pixel-wide kernels.
        ; Reads arg(0)..arg(5); the filter table at arg(5) is fetched with movdqa,
        ; so it must be 16-byte aligned.
        ; On exit:
        ;   rsi  = src_ptr           rdi = output_ptr
        ;   rax  = pixels_per_line   rdx = out_pitch    rcx = output_height
        ;   xmm2 = zero (used to unpack bytes to words)
        ;   xmm3 = 64 in every 16-bit word (rounding term for the >> 7 in
        ;          APPLY_FILTER_4)
        ;   xmm4 = filter word 3 (k3) in words 0-3, filter word 4 (k4) in words 4-7
        ; rdx and rcx serve as scratch first (filter pointer, rounding constant)
        ; and only receive their final values in the last two instructions.
     14     mov         rdx, arg(5)                 ;filter ptr
     15     mov         rsi, arg(0)                 ;src_ptr
     16     mov         rdi, arg(2)                 ;output_ptr
     17     mov         rcx, 0x0400040              ;= 0x00400040: 64 in each of the two low words
     18 
     19     movdqa      xmm3, [rdx]                 ;load filters (aligned 16-byte load)
     20     pshuflw     xmm4, xmm3, 11111111b       ;k3 replicated across words 0-3
     21     psrldq      xmm3, 8                     ;bring filter words 4-7 down to words 0-3
     22     pshuflw     xmm3, xmm3, 0b              ;k4 replicated across words 0-3
     23     punpcklqdq  xmm4, xmm3                  ;k3k4: low qword = k3 x4, high qword = k4 x4
     24 
     25     movq        xmm3, rcx                   ;rounding
     26     pshufd      xmm3, xmm3, 0               ;broadcast the low dword to all four dwords
     27 
     28     pxor        xmm2, xmm2                  ;zero register for punpcklbw
     29 
     30     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     31     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     32     movsxd      rcx, DWORD PTR arg(4)       ;output_height
     33 %endm
     34 
     35 %macro APPLY_FILTER_4 1
        ; Filters one row of 4 pixels and steps to the next row.
        ; %1:  0 = store the filtered bytes; non-zero = first average them (pavgb)
        ;      with the 4 bytes already at [rdi].
        ; In:  low dword of xmm0 = 4 source bytes for the first tap (a)
        ;      low dword of xmm1 = 4 source bytes for the second tap (b)
        ;      xmm2/xmm3/xmm4, rax, rdx, rcx, rsi, rdi as left by GET_PARAM_4
        ; Out: 4 bytes written at [rdi]; per pixel (a*k3 + b*k4 + 64) >> 7 computed
        ;      in 16-bit lanes (pmullw low half, saturating adds) and packed with
        ;      unsigned saturation.
        ;      rsi += rax, rdi += rdx, rcx -= 1. dec is the last instruction and
        ;      lea does not touch flags, so ZF reflects rcx == 0 for the caller's
        ;      jnz.
        ; Clobbers xmm0, xmm1.
     36 
     37     punpckldq   xmm0, xmm1                  ;two row in one register: dword0 = a, dword1 = b
     38     punpcklbw   xmm0, xmm2                  ;unpack to word: words 0-3 = a, words 4-7 = b
     39     pmullw      xmm0, xmm4                  ;multiply the filter factors (a*k3 | b*k4)
     40 
     41     movdqa      xmm1, xmm0
     42     psrldq      xmm1, 8                     ;move the b*k4 products down to words 0-3
     43     paddsw      xmm0, xmm1                  ;words 0-3 = a*k3 + b*k4
     44 
     45     paddsw      xmm0, xmm3                  ;rounding
     46     psraw       xmm0, 7                     ;shift
     47     packuswb    xmm0, xmm0                  ;pack to byte
     48 
     49 %if %1
     50     movd        xmm1, [rdi]                 ;current destination pixels
     51     pavgb       xmm0, xmm1                  ;rounded byte average with destination
     52 %endif
     53 
     54     movd        [rdi], xmm0                 ;only the low 4 bytes are stored
     55     lea         rsi, [rsi + rax]            ;next source row
     56     lea         rdi, [rdi + rdx]            ;next destination row
     57     dec         rcx                         ;rows remaining; sets ZF for the caller
     58 %endm
     59 
     60 %macro GET_PARAM 0
        ; Loads the arguments and constants shared by the 8- and 16-pixel-wide
        ; kernels. Reads arg(0)..arg(5); the filter table at arg(5) is fetched with
        ; movdqa, so it must be 16-byte aligned.
        ; On exit:
        ;   rsi  = src_ptr           rdi = output_ptr
        ;   rax  = pixels_per_line   rdx = out_pitch    rcx = output_height
        ;   xmm4 = 64 in every 16-bit word (rounding term for the >> 7)
        ;   xmm5 = zero (used to unpack bytes to words)
        ;   xmm6 = filter word 3 (k3) in all 8 words
        ;   xmm7 = filter word 4 (k4) in all 8 words
        ; rdx and rcx serve as scratch first (filter pointer, rounding constant)
        ; and only receive their final values in the last two instructions.
     61     mov         rdx, arg(5)                 ;filter ptr
     62     mov         rsi, arg(0)                 ;src_ptr
     63     mov         rdi, arg(2)                 ;output_ptr
     64     mov         rcx, 0x0400040              ;= 0x00400040: 64 in each of the two low words
     65 
     66     movdqa      xmm7, [rdx]                 ;load filters (aligned 16-byte load)
     67 
     68     pshuflw     xmm6, xmm7, 11111111b       ;k3 replicated across words 0-3
     69     pshufhw     xmm7, xmm7, 0b              ;k4 replicated across words 4-7
     70     punpcklwd   xmm6, xmm6                  ;duplicate the low words -> k3 in all 8 words
     71     punpckhwd   xmm7, xmm7                  ;duplicate the high words -> k4 in all 8 words
     72 
     73     movq        xmm4, rcx                   ;rounding
     74     pshufd      xmm4, xmm4, 0               ;broadcast the low dword to all four dwords
     75 
     76     pxor        xmm5, xmm5                  ;zero register for punpck{l,h}bw
     77 
     78     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     79     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     80     movsxd      rcx, DWORD PTR arg(4)       ;output_height
     81 %endm
     82 
     83 %macro APPLY_FILTER_8 1
        ; Filters one row of 8 pixels and steps to the next row.
        ; %1:  0 = store the filtered bytes; non-zero = first average them (pavgb)
        ;      with the 8 bytes already at [rdi].
        ; In:  low 8 bytes of xmm0 = source bytes for the first tap (a)
        ;      low 8 bytes of xmm1 = source bytes for the second tap (b)
        ;      xmm4-xmm7, rax, rdx, rcx, rsi, rdi as left by GET_PARAM
        ; Out: 8 bytes written at [rdi]; per pixel (a*k3 + b*k4 + 64) >> 7 computed
        ;      in 16-bit lanes (pmullw low half, saturating adds) and packed with
        ;      unsigned saturation.
        ;      rsi += rax, rdi += rdx, rcx -= 1; dec is last, so ZF reflects
        ;      rcx == 0 for the caller's jnz.
        ; Clobbers xmm0, xmm1.
     84     punpcklbw   xmm0, xmm5                  ;a: bytes -> words
     85     punpcklbw   xmm1, xmm5                  ;b: bytes -> words
     86 
     87     pmullw      xmm0, xmm6                  ;a * k3
     88     pmullw      xmm1, xmm7                  ;b * k4
     89     paddsw      xmm0, xmm1
     90     paddsw      xmm0, xmm4                  ;rounding
     91     psraw       xmm0, 7                     ;shift
     92     packuswb    xmm0, xmm0                  ;pack back to byte
     93 %if %1
     94     movq        xmm1, [rdi]                 ;current destination pixels
     95     pavgb       xmm0, xmm1                  ;rounded byte average with destination
     96 %endif
     97     movq        [rdi], xmm0                 ;store the result
     98 
     99     lea         rsi, [rsi + rax]            ;next source row
    100     lea         rdi, [rdi + rdx]            ;next destination row
    101     dec         rcx                         ;rows remaining; sets ZF for the caller
    102 %endm
    103 
    104 %macro APPLY_FILTER_16 1
        ; Filters one row of 16 pixels and steps to the next row.
        ; %1:  0 = store the filtered bytes; non-zero = first average them (pavgb)
        ;      with the 16 bytes already at [rdi].
        ; In:  xmm0 and xmm2 = the same 16 source bytes for the first tap (a);
        ;      the low half is taken from xmm0, the high half from xmm2
        ;      xmm1 and xmm3 = the same 16 source bytes for the second tap (b);
        ;      the low half is taken from xmm1, the high half from xmm3
        ;      xmm4-xmm7, rax, rdx, rcx, rsi, rdi as left by GET_PARAM
        ; Out: 16 bytes written at [rdi] (unaligned store); per pixel
        ;      (a*k3 + b*k4 + 64) >> 7 computed in 16-bit lanes (pmullw low half,
        ;      saturating adds) and packed with unsigned saturation.
        ;      rsi += rax, rdi += rdx, rcx -= 1; dec is last, so ZF reflects
        ;      rcx == 0 for the caller's jnz.
        ; Clobbers xmm0-xmm3.
    105     punpcklbw   xmm0, xmm5                  ;a, pixels 0-7  -> words
    106     punpcklbw   xmm1, xmm5                  ;b, pixels 0-7  -> words
    107     punpckhbw   xmm2, xmm5                  ;a, pixels 8-15 -> words
    108     punpckhbw   xmm3, xmm5                  ;b, pixels 8-15 -> words
    109 
    110     pmullw      xmm0, xmm6                  ;a * k3
    111     pmullw      xmm1, xmm7                  ;b * k4
    112     pmullw      xmm2, xmm6
    113     pmullw      xmm3, xmm7
    114 
    115     paddsw      xmm0, xmm1
    116     paddsw      xmm2, xmm3
    117 
    118     paddsw      xmm0, xmm4                  ;rounding
    119     paddsw      xmm2, xmm4
    120     psraw       xmm0, 7                     ;shift
    121     psraw       xmm2, 7
    122     packuswb    xmm0, xmm2                  ;pack back to byte
    123 %if %1
    124     movdqu      xmm1, [rdi]                 ;current destination pixels
    125     pavgb       xmm0, xmm1                  ;rounded byte average with destination
    126 %endif
    127     movdqu      [rdi], xmm0                 ;store the result
    128 
    129     lea         rsi, [rsi + rax]            ;next source row
    130     lea         rdi, [rdi + rdx]            ;next destination row
    131     dec         rcx                         ;rows remaining; sets ZF for the caller
    132 %endm
    133 
    134 global sym(vp9_filter_block1d4_v2_sse2) PRIVATE
        ; 4-pixel-wide 2-tap filter across rows: every output byte combines src[x]
        ; with src[x + pixels_per_line] using filter words 3 and 4, adds 64,
        ; shifts right by 7 and is stored to the destination.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
        ; Uses only xmm0-xmm4.
    135 sym(vp9_filter_block1d4_v2_sse2):
    136     push        rbp
    137     mov         rbp, rsp
    138     SHADOW_ARGS_TO_STACK 6
    139     push        rsi
    140     push        rdi
    141     ; end prolog
    142 
    143     GET_PARAM_4
    144 .loop:
    145     movd        xmm0, [rsi]                 ;load src: 4 bytes of the current row
    146     movd        xmm1, [rsi + rax]           ;4 bytes one source stride further on
    147 
    148     APPLY_FILTER_4 0
    149     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    150 
    151     ; begin epilog
    152     pop         rdi
    153     pop         rsi
    154     UNSHADOW_ARGS
    155     pop         rbp
    156     ret
    157 
    158 global sym(vp9_filter_block1d8_v2_sse2) PRIVATE
        ; 8-pixel-wide 2-tap filter across rows: every output byte combines src[x]
        ; with src[x + pixels_per_line] using filter words 3 and 4, adds 64,
        ; shifts right by 7 and is stored to the destination.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    159 sym(vp9_filter_block1d8_v2_sse2):
    160     push        rbp
    161     mov         rbp, rsp
    162     SHADOW_ARGS_TO_STACK 6
    163     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    164     push        rsi
    165     push        rdi
    166     ; end prolog
    167 
    168     GET_PARAM
    169 .loop:
    170     movq        xmm0, [rsi]                 ;0: 8 bytes of the current row
    171     movq        xmm1, [rsi + rax]           ;1: 8 bytes one source stride further on
    172 
    173     APPLY_FILTER_8 0
    174     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    175 
    176     ; begin epilog
    177     pop         rdi
    178     pop         rsi
    179     RESTORE_XMM
    180     UNSHADOW_ARGS
    181     pop         rbp
    182     ret
    183 
    184 global sym(vp9_filter_block1d16_v2_sse2) PRIVATE
        ; 16-pixel-wide 2-tap filter across rows: every output byte combines src[x]
        ; with src[x + pixels_per_line] using filter words 3 and 4, adds 64,
        ; shifts right by 7 and is stored to the destination.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    185 sym(vp9_filter_block1d16_v2_sse2):
    186     push        rbp
    187     mov         rbp, rsp
    188     SHADOW_ARGS_TO_STACK 6
    189     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    190     push        rsi
    191     push        rdi
    192     ; end prolog
    193 
    194     GET_PARAM
    195 .loop:
    196     movdqu        xmm0, [rsi]               ;0: 16 bytes of the current row
    197     movdqu        xmm1, [rsi + rax]         ;1: 16 bytes one source stride further on
    198     movdqa        xmm2, xmm0                ;copy: the macro unpacks the high half from xmm2
    199     movdqa        xmm3, xmm1                ;copy: the macro unpacks the high half from xmm3
    200 
    201     APPLY_FILTER_16 0
    202     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    203 
    204     ; begin epilog
    205     pop         rdi
    206     pop         rsi
    207     RESTORE_XMM
    208     UNSHADOW_ARGS
    209     pop         rbp
    210     ret
    211 
    212 global sym(vp9_filter_block1d4_v2_avg_sse2) PRIVATE
        ; 4-pixel-wide 2-tap filter across rows with averaging: every filtered byte
        ; (src[x] and src[x + pixels_per_line] weighted by filter words 3 and 4,
        ; plus 64, shifted right by 7) is averaged with the byte already in the
        ; destination (pavgb) before being stored.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
        ; Uses only xmm0-xmm4.
    213 sym(vp9_filter_block1d4_v2_avg_sse2):
    214     push        rbp
    215     mov         rbp, rsp
    216     SHADOW_ARGS_TO_STACK 6
    217     push        rsi
    218     push        rdi
    219     ; end prolog
    220 
    221     GET_PARAM_4
    222 .loop:
    223     movd        xmm0, [rsi]                 ;load src: 4 bytes of the current row
    224     movd        xmm1, [rsi + rax]           ;4 bytes one source stride further on
    225 
    226     APPLY_FILTER_4 1
    227     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    228 
    229     ; begin epilog
    230     pop         rdi
    231     pop         rsi
    232     UNSHADOW_ARGS
    233     pop         rbp
    234     ret
    235 
    236 global sym(vp9_filter_block1d8_v2_avg_sse2) PRIVATE
        ; 8-pixel-wide 2-tap filter across rows with averaging: every filtered byte
        ; (src[x] and src[x + pixels_per_line] weighted by filter words 3 and 4,
        ; plus 64, shifted right by 7) is averaged with the byte already in the
        ; destination (pavgb) before being stored.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    237 sym(vp9_filter_block1d8_v2_avg_sse2):
    238     push        rbp
    239     mov         rbp, rsp
    240     SHADOW_ARGS_TO_STACK 6
    241     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    242     push        rsi
    243     push        rdi
    244     ; end prolog
    245 
    246     GET_PARAM
    247 .loop:
    248     movq        xmm0, [rsi]                 ;0: 8 bytes of the current row
    249     movq        xmm1, [rsi + rax]           ;1: 8 bytes one source stride further on
    250 
    251     APPLY_FILTER_8 1
    252     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    253 
    254     ; begin epilog
    255     pop         rdi
    256     pop         rsi
    257     RESTORE_XMM
    258     UNSHADOW_ARGS
    259     pop         rbp
    260     ret
    261 
    262 global sym(vp9_filter_block1d16_v2_avg_sse2) PRIVATE
        ; 16-pixel-wide 2-tap filter across rows with averaging: every filtered
        ; byte (src[x] and src[x + pixels_per_line] weighted by filter words 3 and
        ; 4, plus 64, shifted right by 7) is averaged with the byte already in the
        ; destination (pavgb) before being stored.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    263 sym(vp9_filter_block1d16_v2_avg_sse2):
    264     push        rbp
    265     mov         rbp, rsp
    266     SHADOW_ARGS_TO_STACK 6
    267     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    268     push        rsi
    269     push        rdi
    270     ; end prolog
    271 
    272     GET_PARAM
    273 .loop:
    274     movdqu        xmm0, [rsi]               ;0: 16 bytes of the current row
    275     movdqu        xmm1, [rsi + rax]         ;1: 16 bytes one source stride further on
    276     movdqa        xmm2, xmm0                ;copy: the macro unpacks the high half from xmm2
    277     movdqa        xmm3, xmm1                ;copy: the macro unpacks the high half from xmm3
    278 
    279     APPLY_FILTER_16 1
    280     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    281 
    282     ; begin epilog
    283     pop         rdi
    284     pop         rsi
    285     RESTORE_XMM
    286     UNSHADOW_ARGS
    287     pop         rbp
    288     ret
    289 
    290 global sym(vp9_filter_block1d4_h2_sse2) PRIVATE
        ; 4-pixel-wide 2-tap filter along a row: every output byte combines src[x]
        ; with src[x + 1] using filter words 3 and 4, adds 64, shifts right by 7
        ; and is stored to the destination.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
        ; Uses only xmm0-xmm4.
    291 sym(vp9_filter_block1d4_h2_sse2):
    292     push        rbp
    293     mov         rbp, rsp
    294     SHADOW_ARGS_TO_STACK 6
    295     push        rsi
    296     push        rdi
    297     ; end prolog
    298 
    299     GET_PARAM_4
    300 .loop:
            ; APPLY_FILTER_4 consumes only the low dword of xmm0 and xmm1, i.e.
            ; src[0..4]. Load exactly those bytes instead of a 16-byte movdqu so
            ; the kernel never touches memory past the pixels it filters.
            movd        xmm0, [rsi]                 ;src[0..3]
            movd        xmm1, [rsi + 1]             ;src[1..4]
    304 
    305     APPLY_FILTER_4 0
    306     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    307 
    308     ; begin epilog
    309     pop         rdi
    310     pop         rsi
    311     UNSHADOW_ARGS
    312     pop         rbp
    313     ret
    314 
    315 global sym(vp9_filter_block1d8_h2_sse2) PRIVATE
        ; 8-pixel-wide 2-tap filter along a row: every output byte combines src[x]
        ; with src[x + 1] using filter words 3 and 4, adds 64, shifts right by 7
        ; and is stored to the destination.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    316 sym(vp9_filter_block1d8_h2_sse2):
    317     push        rbp
    318     mov         rbp, rsp
    319     SHADOW_ARGS_TO_STACK 6
    320     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    321     push        rsi
    322     push        rdi
    323     ; end prolog
    324 
    325     GET_PARAM
    326 .loop:
            ; APPLY_FILTER_8 consumes only the low 8 bytes of xmm0 and xmm1, i.e.
            ; src[0..8]. Load exactly those bytes instead of a 16-byte movdqu so
            ; the kernel never touches memory past the pixels it filters.
            movq        xmm0, [rsi]                 ;src[0..7]
            movq        xmm1, [rsi + 1]             ;src[1..8]
    330 
    331     APPLY_FILTER_8 0
    332     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    333 
    334     ; begin epilog
    335     pop         rdi
    336     pop         rsi
    337     RESTORE_XMM
    338     UNSHADOW_ARGS
    339     pop         rbp
    340     ret
    341 
    342 global sym(vp9_filter_block1d16_h2_sse2) PRIVATE
        ; 16-pixel-wide 2-tap filter along a row: every output byte combines src[x]
        ; with src[x + 1] using filter words 3 and 4, adds 64, shifts right by 7
        ; and is stored to the destination. Reads 17 source bytes per row.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    343 sym(vp9_filter_block1d16_h2_sse2):
    344     push        rbp
    345     mov         rbp, rsp
    346     SHADOW_ARGS_TO_STACK 6
    347     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    348     push        rsi
    349     push        rdi
    350     ; end prolog
    351 
    352     GET_PARAM
    353 .loop:
    354     movdqu      xmm0,   [rsi]               ;load src: src[0..15]
    355     movdqu      xmm1,   [rsi + 1]           ;src[1..16]
    356     movdqa      xmm2, xmm0                  ;copy: the macro unpacks the high half from xmm2
    357     movdqa      xmm3, xmm1                  ;copy: the macro unpacks the high half from xmm3
    358 
    359     APPLY_FILTER_16 0
    360     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    361 
    362     ; begin epilog
    363     pop         rdi
    364     pop         rsi
    365     RESTORE_XMM
    366     UNSHADOW_ARGS
    367     pop         rbp
    368     ret
    369 
    370 global sym(vp9_filter_block1d4_h2_avg_sse2) PRIVATE
        ; 4-pixel-wide 2-tap filter along a row with averaging: every filtered byte
        ; (src[x] and src[x + 1] weighted by filter words 3 and 4, plus 64, shifted
        ; right by 7) is averaged with the byte already in the destination (pavgb)
        ; before being stored.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
        ; Uses only xmm0-xmm4.
    371 sym(vp9_filter_block1d4_h2_avg_sse2):
    372     push        rbp
    373     mov         rbp, rsp
    374     SHADOW_ARGS_TO_STACK 6
    375     push        rsi
    376     push        rdi
    377     ; end prolog
    378 
    379     GET_PARAM_4
    380 .loop:
            ; APPLY_FILTER_4 consumes only the low dword of xmm0 and xmm1, i.e.
            ; src[0..4]. Load exactly those bytes instead of a 16-byte movdqu so
            ; the kernel never touches memory past the pixels it filters.
            movd        xmm0, [rsi]                 ;src[0..3]
            movd        xmm1, [rsi + 1]             ;src[1..4]
    384 
    385     APPLY_FILTER_4 1
    386     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    387 
    388     ; begin epilog
    389     pop         rdi
    390     pop         rsi
    391     UNSHADOW_ARGS
    392     pop         rbp
    393     ret
    394 
    395 global sym(vp9_filter_block1d8_h2_avg_sse2) PRIVATE
        ; 8-pixel-wide 2-tap filter along a row with averaging: every filtered byte
        ; (src[x] and src[x + 1] weighted by filter words 3 and 4, plus 64, shifted
        ; right by 7) is averaged with the byte already in the destination (pavgb)
        ; before being stored.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    396 sym(vp9_filter_block1d8_h2_avg_sse2):
    397     push        rbp
    398     mov         rbp, rsp
    399     SHADOW_ARGS_TO_STACK 6
    400     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    401     push        rsi
    402     push        rdi
    403     ; end prolog
    404 
    405     GET_PARAM
    406 .loop:
            ; APPLY_FILTER_8 consumes only the low 8 bytes of xmm0 and xmm1, i.e.
            ; src[0..8]. Load exactly those bytes instead of a 16-byte movdqu so
            ; the kernel never touches memory past the pixels it filters.
            movq        xmm0, [rsi]                 ;src[0..7]
            movq        xmm1, [rsi + 1]             ;src[1..8]
    410 
    411     APPLY_FILTER_8 1
    412     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    413 
    414     ; begin epilog
    415     pop         rdi
    416     pop         rsi
    417     RESTORE_XMM
    418     UNSHADOW_ARGS
    419     pop         rbp
    420     ret
    421 
    422 global sym(vp9_filter_block1d16_h2_avg_sse2) PRIVATE
        ; 16-pixel-wide 2-tap filter along a row with averaging: every filtered
        ; byte (src[x] and src[x + 1] weighted by filter words 3 and 4, plus 64,
        ; shifted right by 7) is averaged with the byte already in the destination
        ; (pavgb) before being stored. Reads 17 source bytes per row.
        ; Args by index: 0 src_ptr, 1 pixels_per_line, 2 output_ptr, 3 out_pitch,
        ;                4 output_height, 5 filter ptr.
        ; The row counter is tested after the loop body, so an output_height of 0
        ; is not handled (the counter would wrap).
    423 sym(vp9_filter_block1d16_h2_avg_sse2):
    424     push        rbp
    425     mov         rbp, rsp
    426     SHADOW_ARGS_TO_STACK 6
    427     SAVE_XMM 7                              ;NOTE(review): presumably preserves xmm6-xmm7 where the ABI needs it - confirm in x86_abi_support.asm
    428     push        rsi
    429     push        rdi
    430     ; end prolog
    431 
    432     GET_PARAM
    433 .loop:
    434     movdqu      xmm0,   [rsi]               ;load src: src[0..15]
    435     movdqu      xmm1,   [rsi + 1]           ;src[1..16]
    436     movdqa      xmm2, xmm0                  ;copy: the macro unpacks the high half from xmm2
    437     movdqa      xmm3, xmm1                  ;copy: the macro unpacks the high half from xmm3
    438 
    439     APPLY_FILTER_16 1
    440     jnz         .loop                       ;ZF comes from the macro's final dec rcx
    441 
    442     ; begin epilog
    443     pop         rdi
    444     pop         rsi
    445     RESTORE_XMM
    446     UNSHADOW_ARGS
    447     pop         rbp
    448     ret
    449
    449