Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 %include "vpx_ports/x86_abi_support.asm"
     12 
     13 %macro HIGH_GET_PARAM_4 0
        ; Loads the seven arguments and builds the constants used by the
        ; 4-pixel-wide kernels. Register state on exit:
        ;   rsi  = arg(0) src_ptr           rdi = arg(2) output_ptr
        ;   rax  = arg(1) pixels_per_line   rdx = arg(3) out_pitch
        ;   rcx  = arg(4) output_height     (rax, rdx, rcx sign-extended from dwords)
        ;   xmm4 = filter words 3 and 4 interleaved: k3,k4,k3,k4,k3,k4,k3,k4
        ;   xmm3 = 0x40 in every dword (rounding term)
        ;   xmm5 = (1 << bps) - 1 in every word (clamp maximum)
        ;   xmm2 = 0 (clamp minimum)
        ; xmm1 is used as scratch. The filter is read with movdqa, so the filter
        ; pointer has to be 16-byte aligned.
     14     mov         rdx, arg(5)                 ;filter ptr
     15     mov         rsi, arg(0)                 ;src_ptr
     16     mov         rdi, arg(2)                 ;output_ptr
     17     mov         rcx, 0x00000040             ;64, later broadcast as the rounding term
     18 
     19     movdqa      xmm3, [rdx]                 ;load all 8 filter words (aligned load)
     20     pshuflw     xmm4, xmm3, 11111111b       ;low 4 words = k3
     21     psrldq      xmm3, 8                     ;move words 4..7 into the low half
     22     pshuflw     xmm3, xmm3, 0b              ;low 4 words = k4
     23     punpcklwd   xmm4, xmm3                  ;k3k4 repeated 4 times
     24 
     25     movq        xmm3, rcx                   ;rounding
     26     pshufd      xmm3, xmm3, 0               ;broadcast to all 4 dwords
     27 
     28     mov         rdx, 0x00010001             ;two words, each holding 1
     29     movsxd      rcx, DWORD PTR arg(6)       ;bps
     30     movq        xmm5, rdx
     31     movq        xmm2, rcx                   ;shift count = bps
     32     pshufd      xmm5, xmm5, 0b              ;every word = 1
     33     movdqa      xmm1, xmm5                  ;keep a copy of the ones
     34     psllw       xmm5, xmm2                  ;every word = 1 << bps
     35     psubw       xmm5, xmm1                  ;max value (for clamping)
     36     pxor        xmm2, xmm2                  ;min value (for clamping)
     37 
     38     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     39     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     40     movsxd      rcx, DWORD PTR arg(4)       ;output_height
     41 %endm
     42 
     43 %macro HIGH_APPLY_FILTER_4 1
        ; Filters one row of 4 words and steps to the next row.
        ; In:  low 4 words of xmm0 and xmm1 = the two inputs of the 2-tap filter;
        ;      xmm4 = k3k4 pairs, xmm3 = rounding, xmm5 = max, xmm2 = 0
        ;      (the register layout produced by HIGH_GET_PARAM_4).
        ; Macro argument: nonzero averages the result with the 4 words already
        ;      stored at [rdi] (pavgw, rounding up) before writing it back.
        ; Out: 8 bytes written to [rdi]; rsi += 2*rax, rdi += 2*rdx, rcx -= 1.
        ;      The flags set by the final dec are left for the jnz that follows
        ;      the macro in every caller (lea does not modify flags).
        ; Clobbers xmm0, and xmm1 when averaging.
     44 
     45     punpcklwd   xmm0, xmm1                  ;interleave the low 4 words of both inputs
     46     pmaddwd     xmm0, xmm4                  ;each dword = in0*k3 + in1*k4
     47 
     48     paddd       xmm0, xmm3                  ;rounding
     49     psrad       xmm0, 7                     ;shift
     50     packssdw    xmm0, xmm0                  ;pack to word (signed saturation)
     51 
     52     ;clamp the values to [0, (1 << bps) - 1]
     53     pminsw      xmm0, xmm5
     54     pmaxsw      xmm0, xmm2
     55 
     56 %if %1
     57     movq        xmm1, [rdi]                 ;words currently in the output row
     58     pavgw       xmm0, xmm1                  ;(new + old + 1) >> 1
     59 %endif
     60 
     61     movq        [rdi], xmm0                 ;store 4 words
     62     lea         rsi, [rsi + 2*rax]          ;advance source by 2*rax bytes
     63     lea         rdi, [rdi + 2*rdx]          ;advance output by 2*rdx bytes
     64     dec         rcx                         ;rows left; ZF is tested by the caller
     65 %endm
     66 
     67 %if ARCH_X86_64
     68 %macro HIGH_GET_PARAM 0
        ; Loads the seven arguments and builds the constants used by the 8- and
        ; 16-pixel-wide kernels. Register state on exit:
        ;   rsi  = arg(0) src_ptr           rdi = arg(2) output_ptr
        ;   rax  = arg(1) pixels_per_line   rdx = arg(3) out_pitch
        ;   rcx  = arg(4) output_height     (rax, rdx, rcx sign-extended from dwords)
        ;   xmm7 = filter words 3 and 4 interleaved: k3,k4,k3,k4,k3,k4,k3,k4
        ;   xmm4 = 0x40 in every dword (rounding term)
        ;   xmm8 = (1 << bps) - 1 in every word (clamp maximum)
        ;   xmm5 = 0 (clamp minimum)
        ; xmm1 and xmm6 are used as scratch. The filter is read with movdqa, so
        ; the filter pointer has to be 16-byte aligned.
     69     mov         rdx, arg(5)                 ;filter ptr
     70     mov         rsi, arg(0)                 ;src_ptr
     71     mov         rdi, arg(2)                 ;output_ptr
     72     mov         rcx, 0x00000040             ;64, later broadcast as the rounding term
     73 
     74     movdqa      xmm6, [rdx]                 ;load all 8 filter words (aligned load)
     75 
     76     pshuflw     xmm7, xmm6, 11111111b       ;low 4 words = k3
     77     pshufhw     xmm6, xmm6, 0b              ;high 4 words = k4
     78     psrldq      xmm6, 8                     ;move the k4 copies into the low half
     79     punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
     80 
     81     movq        xmm4, rcx                   ;rounding
     82     pshufd      xmm4, xmm4, 0               ;broadcast to all 4 dwords
     83 
     84     mov         rdx, 0x00010001             ;two words, each holding 1
     85     movsxd      rcx, DWORD PTR arg(6)       ;bps
     86     movq        xmm8, rdx
     87     movq        xmm5, rcx                   ;shift count = bps
     88     pshufd      xmm8, xmm8, 0b              ;every word = 1
     89     movdqa      xmm1, xmm8                  ;keep a copy of the ones
     90     psllw       xmm8, xmm5                  ;every word = 1 << bps
     91     psubw       xmm8, xmm1                  ;max value (for clamping)
     92     pxor        xmm5, xmm5                  ;min value (for clamping)
     93 
     94     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
     95     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
     96     movsxd      rcx, DWORD PTR arg(4)       ;output_height
     97 %endm
     98 
     99 %macro HIGH_APPLY_FILTER_8 1
        ; Filters one row of 8 words and steps to the next row.
        ; In:  xmm0, xmm1 = the two inputs of the 2-tap filter (8 words each);
        ;      xmm7 = k3k4 pairs, xmm4 = rounding, xmm8 = max, xmm5 = 0
        ;      (the register layout produced by HIGH_GET_PARAM).
        ; Macro argument: nonzero averages the result with the 8 words already
        ;      stored at [rdi] (pavgw, rounding up) before writing it back.
        ; Out: 16 bytes written to [rdi] with an unaligned store;
        ;      rsi += 2*rax, rdi += 2*rdx, rcx -= 1. The flags set by the final
        ;      dec are left for the jnz that follows the macro in every caller.
        ; Clobbers xmm0, xmm6, and xmm1 when averaging.
    100     movdqa      xmm6, xmm0                  ;copy, used for words 4..7
    101     punpckhwd   xmm6, xmm1                  ;interleave words 4..7 of both inputs
    102     punpcklwd   xmm0, xmm1                  ;interleave words 0..3 of both inputs
    103     pmaddwd     xmm6, xmm7                  ;each dword = in0*k3 + in1*k4
    104     pmaddwd     xmm0, xmm7
    105 
    106     paddd       xmm6, xmm4                  ;rounding
    107     paddd       xmm0, xmm4                  ;rounding
    108     psrad       xmm6, 7                     ;shift
    109     psrad       xmm0, 7                     ;shift
    110     packssdw    xmm0, xmm6                  ;pack back to word (0..3 low, 4..7 high)
    111 
    112     ;clamp the values to [0, (1 << bps) - 1]
    113     pminsw      xmm0, xmm8
    114     pmaxsw      xmm0, xmm5
    115 
    116 %if %1
    117     movdqu      xmm1, [rdi]                 ;words currently in the output row
    118     pavgw       xmm0, xmm1                  ;(new + old + 1) >> 1
    119 %endif
    120     movdqu      [rdi], xmm0                 ;store the result
    121 
    122     lea         rsi, [rsi + 2*rax]          ;advance source by 2*rax bytes
    123     lea         rdi, [rdi + 2*rdx]          ;advance output by 2*rdx bytes
    124     dec         rcx                         ;rows left; ZF is tested by the caller
    125 %endm
    126 
    127 %macro HIGH_APPLY_FILTER_16 1
        ; Filters one row of 16 words and steps to the next row.
        ; In:  xmm0, xmm1 = the two filter inputs for words 0..7;
        ;      xmm2, xmm3 = the two filter inputs for words 8..15;
        ;      xmm7 = k3k4 pairs, xmm4 = rounding, xmm8 = max, xmm5 = 0
        ;      (the register layout produced by HIGH_GET_PARAM).
        ; Macro argument: nonzero averages the result with the 16 words already
        ;      stored at [rdi] (pavgw, rounding up) before writing it back.
        ; Out: 32 bytes written to [rdi] with unaligned stores;
        ;      rsi += 2*rax, rdi += 2*rdx, rcx -= 1. The flags set by the final
        ;      dec are left for the jnz that follows the macro in every caller.
        ; Clobbers xmm0, xmm2, xmm6, xmm9, and xmm1, xmm3 when averaging.
    128     movdqa      xmm9, xmm0                  ;copies, used for the high halves
    129     movdqa      xmm6, xmm2
    130     punpckhwd   xmm9, xmm1                  ;interleave words 4..7
    131     punpckhwd   xmm6, xmm3                  ;interleave words 12..15
    132     punpcklwd   xmm0, xmm1                  ;interleave words 0..3
    133     punpcklwd   xmm2, xmm3                  ;interleave words 8..11
    134 
    135     pmaddwd     xmm9, xmm7                  ;each dword = in0*k3 + in1*k4
    136     pmaddwd     xmm6, xmm7
    137     pmaddwd     xmm0, xmm7
    138     pmaddwd     xmm2, xmm7
    139 
    140     paddd       xmm9, xmm4                  ;rounding
    141     paddd       xmm6, xmm4
    142     paddd       xmm0, xmm4
    143     paddd       xmm2, xmm4
    144 
    145     psrad       xmm9, 7                     ;shift
    146     psrad       xmm6, 7
    147     psrad       xmm0, 7
    148     psrad       xmm2, 7
    149 
    150     packssdw    xmm0, xmm9                  ;pack back to word (words 0..7)
    151     packssdw    xmm2, xmm6                  ;pack back to word (words 8..15)
    152 
    153     ;clamp the values to [0, (1 << bps) - 1]
    154     pminsw      xmm0, xmm8
    155     pmaxsw      xmm0, xmm5
    156     pminsw      xmm2, xmm8
    157     pmaxsw      xmm2, xmm5
    158 
    159 %if %1
    160     movdqu      xmm1, [rdi]                 ;words currently in the output row
    161     movdqu      xmm3, [rdi + 16]
    162     pavgw       xmm0, xmm1                  ;(new + old + 1) >> 1
    163     pavgw       xmm2, xmm3
    164 %endif
    165     movdqu      [rdi], xmm0               ;store the result
    166     movdqu      [rdi + 16], xmm2          ;store the result
    167 
    168     lea         rsi, [rsi + 2*rax]          ;advance source by 2*rax bytes
    169     lea         rdi, [rdi + 2*rdx]          ;advance output by 2*rdx bytes
    170     dec         rcx                         ;rows left; ZF is tested by the caller
    171 %endm
    172 %endif
    173 
    174 SECTION .text
    175 
    176 global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE
        ; 2-tap filter across rows, 4 words per row, result stored to the output.
        ; Arguments as read by HIGH_GET_PARAM_4:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi, b the word 2*rax bytes further on
        ; (the next source row), and k3, k4 are filter words 3 and 4.
        ; The row counter is tested only after a row has been processed, so the
        ; loop body always runs at least once; a height of 0 would not stop it.
    177 sym(vpx_highbd_filter_block1d4_v2_sse2):
    178     push        rbp
    179     mov         rbp, rsp
    180     SHADOW_ARGS_TO_STACK 7
    181     push        rsi
    182     push        rdi
    183     ; end prolog
    184 
    185     HIGH_GET_PARAM_4
    186 .loop:
    187     movq        xmm0, [rsi]                 ;4 words of the current row
    188     movq        xmm1, [rsi + 2*rax]         ;4 words of the next row
    189 
    190     HIGH_APPLY_FILTER_4 0
    191     jnz         .loop                       ;flags come from dec rcx in the macro
    192 
    193     ; begin epilog
    194     pop         rdi
    195     pop         rsi
    196     UNSHADOW_ARGS
    197     pop         rbp
    198     ret
    199 
    200 %if ARCH_X86_64
    201 global sym(vpx_highbd_filter_block1d8_v2_sse2) PRIVATE
        ; 2-tap filter across rows, 8 words per row, result stored to the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word 2*rax bytes further on
        ; (the next source row).
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm8 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    202 sym(vpx_highbd_filter_block1d8_v2_sse2):
    203     push        rbp
    204     mov         rbp, rsp
    205     SHADOW_ARGS_TO_STACK 7
    206     SAVE_XMM 8
    207     push        rsi
    208     push        rdi
    209     ; end prolog
    210 
    211     HIGH_GET_PARAM
    212 .loop:
    213     movdqu      xmm0, [rsi]                 ;0: 8 words of the current row
    214     movdqu      xmm1, [rsi + 2*rax]         ;1: 8 words of the next row
    215 
    216     HIGH_APPLY_FILTER_8 0
    217     jnz         .loop                       ;flags come from dec rcx in the macro
    218 
    219     ; begin epilog
    220     pop         rdi
    221     pop         rsi
    222     RESTORE_XMM
    223     UNSHADOW_ARGS
    224     pop         rbp
    225     ret
    226 
    227 global sym(vpx_highbd_filter_block1d16_v2_sse2) PRIVATE
        ; 2-tap filter across rows, 16 words per row, result stored to the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8 and xmm9).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word 2*rax bytes further on
        ; (the next source row).
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm9 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    228 sym(vpx_highbd_filter_block1d16_v2_sse2):
    229     push        rbp
    230     mov         rbp, rsp
    231     SHADOW_ARGS_TO_STACK 7
    232     SAVE_XMM 9
    233     push        rsi
    234     push        rdi
    235     ; end prolog
    236 
    237     HIGH_GET_PARAM
    238 .loop:
    239     movdqu        xmm0, [rsi]               ;0: words 0..7 of the current row
    240     movdqu        xmm2, [rsi + 16]          ;   words 8..15 of the current row
    241     movdqu        xmm1, [rsi + 2*rax]       ;1: words 0..7 of the next row
    242     movdqu        xmm3, [rsi + 2*rax + 16]  ;   words 8..15 of the next row
    243 
    244     HIGH_APPLY_FILTER_16 0
    245     jnz         .loop                       ;flags come from dec rcx in the macro
    246 
    247     ; begin epilog
    248     pop         rdi
    249     pop         rsi
    250     RESTORE_XMM
    251     UNSHADOW_ARGS
    252     pop         rbp
    253     ret
    254 %endif
    255 
    256 global sym(vpx_highbd_filter_block1d4_v2_avg_sse2) PRIVATE
        ; 2-tap filter across rows, 4 words per row, averaged into the output.
        ; Arguments as read by HIGH_GET_PARAM_4:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: f = clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word 2*rax bytes further on
        ; (the next source row); the stored value is (f + old + 1) >> 1, with old
        ; being the word previously at that output position.
        ; The loop body always runs at least once (counter tested after the row).
    257 sym(vpx_highbd_filter_block1d4_v2_avg_sse2):
    258     push        rbp
    259     mov         rbp, rsp
    260     SHADOW_ARGS_TO_STACK 7
    261     push        rsi
    262     push        rdi
    263     ; end prolog
    264 
    265     HIGH_GET_PARAM_4
    266 .loop:
    267     movq        xmm0, [rsi]                 ;4 words of the current row
    268     movq        xmm1, [rsi + 2*rax]         ;4 words of the next row
    269 
    270     HIGH_APPLY_FILTER_4 1
    271     jnz         .loop                       ;flags come from dec rcx in the macro
    272 
    273     ; begin epilog
    274     pop         rdi
    275     pop         rsi
    276     UNSHADOW_ARGS
    277     pop         rbp
    278     ret
    279 
    280 %if ARCH_X86_64
    281 global sym(vpx_highbd_filter_block1d8_v2_avg_sse2) PRIVATE
        ; 2-tap filter across rows, 8 words per row, averaged into the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: f = clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word 2*rax bytes further on
        ; (the next source row); the stored value is (f + old + 1) >> 1, with old
        ; being the word previously at that output position.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm8 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    282 sym(vpx_highbd_filter_block1d8_v2_avg_sse2):
    283     push        rbp
    284     mov         rbp, rsp
    285     SHADOW_ARGS_TO_STACK 7
    286     SAVE_XMM 8
    287     push        rsi
    288     push        rdi
    289     ; end prolog
    290 
    291     HIGH_GET_PARAM
    292 .loop:
    293     movdqu      xmm0, [rsi]                 ;0: 8 words of the current row
    294     movdqu      xmm1, [rsi + 2*rax]         ;1: 8 words of the next row
    295 
    296     HIGH_APPLY_FILTER_8 1
    297     jnz         .loop                       ;flags come from dec rcx in the macro
    298 
    299     ; begin epilog
    300     pop         rdi
    301     pop         rsi
    302     RESTORE_XMM
    303     UNSHADOW_ARGS
    304     pop         rbp
    305     ret
    306 
    307 global sym(vpx_highbd_filter_block1d16_v2_avg_sse2) PRIVATE
        ; 2-tap filter across rows, 16 words per row, averaged into the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8 and xmm9).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: f = clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word 2*rax bytes further on
        ; (the next source row); the stored value is (f + old + 1) >> 1, with old
        ; being the word previously at that output position.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm9 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    308 sym(vpx_highbd_filter_block1d16_v2_avg_sse2):
    309     push        rbp
    310     mov         rbp, rsp
    311     SHADOW_ARGS_TO_STACK 7
    312     SAVE_XMM 9
    313     push        rsi
    314     push        rdi
    315     ; end prolog
    316 
    317     HIGH_GET_PARAM
    318 .loop:
    319     movdqu        xmm0, [rsi]               ;0: words 0..7 of the current row
    320     movdqu        xmm1, [rsi + 2*rax]       ;1: words 0..7 of the next row
    321     movdqu        xmm2, [rsi + 16]          ;   words 8..15 of the current row
    322     movdqu        xmm3, [rsi + 2*rax + 16]  ;   words 8..15 of the next row
    323 
    324     HIGH_APPLY_FILTER_16 1
    325     jnz         .loop                       ;flags come from dec rcx in the macro
    326 
    327     ; begin epilog
    328     pop         rdi
    329     pop         rsi
    330     RESTORE_XMM
    331     UNSHADOW_ARGS
    332     pop         rbp
    333     ret
    334 %endif
    335 
    336 global sym(vpx_highbd_filter_block1d4_h2_sse2) PRIVATE
        ; 2-tap filter within a row, 4 words per row, result stored to the output.
        ; Arguments as read by HIGH_GET_PARAM_4:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word right after it
        ; (2 bytes further on in the same row).
        ; NOTE(review): the movdqu below reads 16 bytes from rsi, while only
        ; source words 0..4 (10 bytes) reach the result, because
        ; HIGH_APPLY_FILTER_4 consumes just the low 4 words of xmm0 and xmm1.
        ; Confirm that callers guarantee the extra 6 bytes are readable.
        ; The loop body always runs at least once (counter tested after the row).
    337 sym(vpx_highbd_filter_block1d4_h2_sse2):
    338     push        rbp
    339     mov         rbp, rsp
    340     SHADOW_ARGS_TO_STACK 7
    341     push        rsi
    342     push        rdi
    343     ; end prolog
    344 
    345     HIGH_GET_PARAM_4
    346 .loop:
    347     movdqu      xmm0, [rsi]                 ;load src (8 words, unaligned)
    348     movdqa      xmm1, xmm0
    349     psrldq      xmm1, 2                     ;xmm1 = same row shifted down by one word
    350 
    351     HIGH_APPLY_FILTER_4 0
    352     jnz         .loop                       ;flags come from dec rcx in the macro
    353 
    354     ; begin epilog
    355     pop         rdi
    356     pop         rsi
    357     UNSHADOW_ARGS
    358     pop         rbp
    359     ret
    360 
    361 %if ARCH_X86_64
    362 global sym(vpx_highbd_filter_block1d8_h2_sse2) PRIVATE
        ; 2-tap filter within a row, 8 words per row, result stored to the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word right after it
        ; (2 bytes further on in the same row). 18 source bytes are read per row.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm8 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    363 sym(vpx_highbd_filter_block1d8_h2_sse2):
    364     push        rbp
    365     mov         rbp, rsp
    366     SHADOW_ARGS_TO_STACK 7
    367     SAVE_XMM 8
    368     push        rsi
    369     push        rdi
    370     ; end prolog
    371 
    372     HIGH_GET_PARAM
    373 .loop:
    374     movdqu      xmm0, [rsi]                 ;load src: words 0..7
    375     movdqu      xmm1, [rsi + 2]             ;words 1..8 (one word to the right)
    376 
    377     HIGH_APPLY_FILTER_8 0
    378     jnz         .loop                       ;flags come from dec rcx in the macro
    379 
    380     ; begin epilog
    381     pop         rdi
    382     pop         rsi
    383     RESTORE_XMM
    384     UNSHADOW_ARGS
    385     pop         rbp
    386     ret
    387 
    388 global sym(vpx_highbd_filter_block1d16_h2_sse2) PRIVATE
        ; 2-tap filter within a row, 16 words per row, result stored to the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8 and xmm9).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word right after it
        ; (2 bytes further on in the same row). 34 source bytes are read per row.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm9 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    389 sym(vpx_highbd_filter_block1d16_h2_sse2):
    390     push        rbp
    391     mov         rbp, rsp
    392     SHADOW_ARGS_TO_STACK 7
    393     SAVE_XMM 9
    394     push        rsi
    395     push        rdi
    396     ; end prolog
    397 
    398     HIGH_GET_PARAM
    399 .loop:
    400     movdqu      xmm0,   [rsi]               ;load src: words 0..7
    401     movdqu      xmm1,   [rsi + 2]           ;words 1..8
    402     movdqu      xmm2,   [rsi + 16]          ;words 8..15
    403     movdqu      xmm3,   [rsi + 18]          ;words 9..16
    404 
    405     HIGH_APPLY_FILTER_16 0
    406     jnz         .loop                       ;flags come from dec rcx in the macro
    407 
    408     ; begin epilog
    409     pop         rdi
    410     pop         rsi
    411     RESTORE_XMM
    412     UNSHADOW_ARGS
    413     pop         rbp
    414     ret
    415 %endif
    416 
    417 global sym(vpx_highbd_filter_block1d4_h2_avg_sse2) PRIVATE
        ; 2-tap filter within a row, 4 words per row, averaged into the output.
        ; Arguments as read by HIGH_GET_PARAM_4:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: f = clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word right after it; the
        ; stored value is (f + old + 1) >> 1, with old being the word previously
        ; at that output position.
        ; NOTE(review): the movdqu below reads 16 bytes from rsi, while only
        ; source words 0..4 (10 bytes) reach the result, because
        ; HIGH_APPLY_FILTER_4 consumes just the low 4 words of xmm0 and xmm1.
        ; Confirm that callers guarantee the extra 6 bytes are readable.
        ; The loop body always runs at least once (counter tested after the row).
    418 sym(vpx_highbd_filter_block1d4_h2_avg_sse2):
    419     push        rbp
    420     mov         rbp, rsp
    421     SHADOW_ARGS_TO_STACK 7
    422     push        rsi
    423     push        rdi
    424     ; end prolog
    425 
    426     HIGH_GET_PARAM_4
    427 .loop:
    428     movdqu      xmm0, [rsi]                 ;load src (8 words, unaligned)
    429     movdqa      xmm1, xmm0
    430     psrldq      xmm1, 2                     ;xmm1 = same row shifted down by one word
    431 
    432     HIGH_APPLY_FILTER_4 1
    433     jnz         .loop                       ;flags come from dec rcx in the macro
    434 
    435     ; begin epilog
    436     pop         rdi
    437     pop         rsi
    438     UNSHADOW_ARGS
    439     pop         rbp
    440     ret
    441 
    442 %if ARCH_X86_64
    443 global sym(vpx_highbd_filter_block1d8_h2_avg_sse2) PRIVATE
        ; 2-tap filter within a row, 8 words per row, averaged into the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: f = clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word right after it; the
        ; stored value is (f + old + 1) >> 1, with old being the word previously
        ; at that output position.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm8 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    444 sym(vpx_highbd_filter_block1d8_h2_avg_sse2):
    445     push        rbp
    446     mov         rbp, rsp
    447     SHADOW_ARGS_TO_STACK 7
    448     SAVE_XMM 8
    449     push        rsi
    450     push        rdi
    451     ; end prolog
    452 
    453     HIGH_GET_PARAM
    454 .loop:
    455     movdqu      xmm0, [rsi]                 ;load src: words 0..7
    456     movdqu      xmm1, [rsi + 2]             ;words 1..8 (one word to the right)
    457 
    458     HIGH_APPLY_FILTER_8 1
    459     jnz         .loop                       ;flags come from dec rcx in the macro
    460 
    461     ; begin epilog
    462     pop         rdi
    463     pop         rsi
    464     RESTORE_XMM
    465     UNSHADOW_ARGS
    466     pop         rbp
    467     ret
    468 
    469 global sym(vpx_highbd_filter_block1d16_h2_avg_sse2) PRIVATE
        ; 2-tap filter within a row, 16 words per row, averaged into the output.
        ; Assembled only when ARCH_X86_64 is nonzero (uses xmm8 and xmm9).
        ; Arguments as read by HIGH_GET_PARAM:
        ;   arg(0) src_ptr, arg(1) pixels_per_line, arg(2) output_ptr,
        ;   arg(3) out_pitch, arg(4) output_height, arg(5) filter ptr, arg(6) bps
        ; Per output word: f = clamp((a*k3 + b*k4 + 64) >> 7, 0, (1 << bps) - 1),
        ; where a is the source word at rsi and b the word right after it; the
        ; stored value is (f + old + 1) >> 1, with old being the word previously
        ; at that output position.
        ; NOTE(review): SAVE_XMM / RESTORE_XMM are defined outside this file;
        ; presumably they preserve xmm6..xmm9 where the ABI requires it - confirm.
        ; The loop body always runs at least once (counter tested after the row).
    470 sym(vpx_highbd_filter_block1d16_h2_avg_sse2):
    471     push        rbp
    472     mov         rbp, rsp
    473     SHADOW_ARGS_TO_STACK 7
    474     SAVE_XMM 9
    475     push        rsi
    476     push        rdi
    477     ; end prolog
    478 
    479     HIGH_GET_PARAM
    480 .loop:
    481     movdqu      xmm0,   [rsi]               ;load src: words 0..7
    482     movdqu      xmm1,   [rsi + 2]           ;words 1..8
    483     movdqu      xmm2,   [rsi + 16]          ;words 8..15
    484     movdqu      xmm3,   [rsi + 18]          ;words 9..16
    485 
    486     HIGH_APPLY_FILTER_16 1
    487     jnz         .loop                       ;flags come from dec rcx in the macro
    488 
    489     ; begin epilog
    490     pop         rdi
    491     pop         rsi
    492     RESTORE_XMM
    493     UNSHADOW_ARGS
    494     pop         rbp
    495     ret
    496 %endif
    497