;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;


%include "aom_ports/x86_abi_support.asm"

;Note: the tap3 and tap4 products have to be added to the sum after all the
;other taps to avoid overflow in the 16-bit saturating adds (paddsw).

     20 %macro GET_FILTERS_4 0
     21     mov         rdx, arg(5)                 ;filter ptr
     22     mov         rcx, 0x0400040
     23 
     24     movdqa      xmm7, [rdx]                 ;load filters
     25     pshuflw     xmm0, xmm7, 0b              ;k0
     26     pshuflw     xmm1, xmm7, 01010101b       ;k1
     27     pshuflw     xmm2, xmm7, 10101010b       ;k2
     28     pshuflw     xmm3, xmm7, 11111111b       ;k3
     29     psrldq      xmm7, 8
     30     pshuflw     xmm4, xmm7, 0b              ;k4
     31     pshuflw     xmm5, xmm7, 01010101b       ;k5
     32     pshuflw     xmm6, xmm7, 10101010b       ;k6
     33     pshuflw     xmm7, xmm7, 11111111b       ;k7
     34 
     35     punpcklqdq  xmm0, xmm1
     36     punpcklqdq  xmm2, xmm3
     37     punpcklqdq  xmm5, xmm4
     38     punpcklqdq  xmm6, xmm7
     39 
     40     movdqa      k0k1, xmm0
     41     movdqa      k2k3, xmm2
     42     movdqa      k5k4, xmm5
     43     movdqa      k6k7, xmm6
     44 
     45     movq        xmm6, rcx
     46     pshufd      xmm6, xmm6, 0
     47     movdqa      krd, xmm6
     48 
     49     pxor        xmm7, xmm7
     50     movdqa      zero, xmm7
     51 %endm
     52 
     53 %macro APPLY_FILTER_4 1
     54     punpckldq   xmm0, xmm1                  ;two row in one register
     55     punpckldq   xmm6, xmm7
     56     punpckldq   xmm2, xmm3
     57     punpckldq   xmm5, xmm4
     58 
     59     punpcklbw   xmm0, zero                  ;unpack to word
     60     punpcklbw   xmm6, zero
     61     punpcklbw   xmm2, zero
     62     punpcklbw   xmm5, zero
     63 
     64     pmullw      xmm0, k0k1                  ;multiply the filter factors
     65     pmullw      xmm6, k6k7
     66     pmullw      xmm2, k2k3
     67     pmullw      xmm5, k5k4
     68 
     69     paddsw      xmm0, xmm6                  ;sum
     70     movdqa      xmm1, xmm0
     71     psrldq      xmm1, 8
     72     paddsw      xmm0, xmm1
     73     paddsw      xmm0, xmm2
     74     psrldq      xmm2, 8
     75     paddsw      xmm0, xmm5
     76     psrldq      xmm5, 8
     77     paddsw      xmm0, xmm2
     78     paddsw      xmm0, xmm5
     79 
     80     paddsw      xmm0, krd                   ;rounding
     81     psraw       xmm0, 7                     ;shift
     82     packuswb    xmm0, xmm0                  ;pack to byte
     83 
     84 %if %1
     85     movd        xmm1, [rdi]
     86     pavgb       xmm0, xmm1
     87 %endif
     88     movd        [rdi], xmm0
     89 %endm
     90 
     91 %macro GET_FILTERS 0
     92     mov         rdx, arg(5)                 ;filter ptr
     93     mov         rsi, arg(0)                 ;src_ptr
     94     mov         rdi, arg(2)                 ;output_ptr
     95     mov         rcx, 0x0400040
     96 
     97     movdqa      xmm7, [rdx]                 ;load filters
     98     pshuflw     xmm0, xmm7, 0b              ;k0
     99     pshuflw     xmm1, xmm7, 01010101b       ;k1
    100     pshuflw     xmm2, xmm7, 10101010b       ;k2
    101     pshuflw     xmm3, xmm7, 11111111b       ;k3
    102     pshufhw     xmm4, xmm7, 0b              ;k4
    103     pshufhw     xmm5, xmm7, 01010101b       ;k5
    104     pshufhw     xmm6, xmm7, 10101010b       ;k6
    105     pshufhw     xmm7, xmm7, 11111111b       ;k7
    106 
    107     punpcklwd   xmm0, xmm0
    108     punpcklwd   xmm1, xmm1
    109     punpcklwd   xmm2, xmm2
    110     punpcklwd   xmm3, xmm3
    111     punpckhwd   xmm4, xmm4
    112     punpckhwd   xmm5, xmm5
    113     punpckhwd   xmm6, xmm6
    114     punpckhwd   xmm7, xmm7
    115 
    116     movdqa      k0,   xmm0                  ;store filter factors on stack
    117     movdqa      k1,   xmm1
    118     movdqa      k2,   xmm2
    119     movdqa      k3,   xmm3
    120     movdqa      k4,   xmm4
    121     movdqa      k5,   xmm5
    122     movdqa      k6,   xmm6
    123     movdqa      k7,   xmm7
    124 
    125     movq        xmm6, rcx
    126     pshufd      xmm6, xmm6, 0
    127     movdqa      krd, xmm6                   ;rounding
    128 
    129     pxor        xmm7, xmm7
    130     movdqa      zero, xmm7
    131 %endm
    132 
    133 %macro LOAD_VERT_8 1
    134     movq        xmm0, [rsi + %1]            ;0
    135     movq        xmm1, [rsi + rax + %1]      ;1
    136     movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    137     lea         rsi,  [rsi + rax]
    138     movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    139     movq        xmm2, [rsi + rax + %1]      ;2
    140     movq        xmm3, [rsi + rax * 2 + %1]  ;3
    141     movq        xmm4, [rsi + rdx + %1]      ;4
    142     movq        xmm5, [rsi + rax * 4 + %1]  ;5
    143 %endm
    144 
    145 %macro APPLY_FILTER_8 2
    146     punpcklbw   xmm0, zero
    147     punpcklbw   xmm1, zero
    148     punpcklbw   xmm6, zero
    149     punpcklbw   xmm7, zero
    150     punpcklbw   xmm2, zero
    151     punpcklbw   xmm5, zero
    152     punpcklbw   xmm3, zero
    153     punpcklbw   xmm4, zero
    154 
    155     pmullw      xmm0, k0
    156     pmullw      xmm1, k1
    157     pmullw      xmm6, k6
    158     pmullw      xmm7, k7
    159     pmullw      xmm2, k2
    160     pmullw      xmm5, k5
    161     pmullw      xmm3, k3
    162     pmullw      xmm4, k4
    163 
    164     paddsw      xmm0, xmm1
    165     paddsw      xmm0, xmm6
    166     paddsw      xmm0, xmm7
    167     paddsw      xmm0, xmm2
    168     paddsw      xmm0, xmm5
    169     paddsw      xmm0, xmm3
    170     paddsw      xmm0, xmm4
    171 
    172     paddsw      xmm0, krd                   ;rounding
    173     psraw       xmm0, 7                     ;shift
    174     packuswb    xmm0, xmm0                  ;pack back to byte
    175 %if %1
    176     movq        xmm1, [rdi + %2]
    177     pavgb       xmm0, xmm1
    178 %endif
    179     movq        [rdi + %2], xmm0
    180 %endm
    181 
    182 SECTION .text
    183 
    184 ;void aom_filter_block1d4_v8_sse2
    185 ;(
    186 ;    unsigned char *src_ptr,
    187 ;    unsigned int   src_pitch,
    188 ;    unsigned char *output_ptr,
    189 ;    unsigned int   out_pitch,
    190 ;    unsigned int   output_height,
    191 ;    short *filter
    192 ;)
    193 global sym(aom_filter_block1d4_v8_sse2) PRIVATE
    194 sym(aom_filter_block1d4_v8_sse2):
    195     push        rbp
    196     mov         rbp, rsp
    197     SHADOW_ARGS_TO_STACK 6
    198     SAVE_XMM 7
    199     push        rsi
    200     push        rdi
    201     push        rbx
    202     ; end prolog
    203 
    204     ALIGN_STACK 16, rax
    205     sub         rsp, 16 * 6
    206     %define k0k1 [rsp + 16 * 0]
    207     %define k2k3 [rsp + 16 * 1]
    208     %define k5k4 [rsp + 16 * 2]
    209     %define k6k7 [rsp + 16 * 3]
    210     %define krd [rsp + 16 * 4]
    211     %define zero [rsp + 16 * 5]
    212 
    213     GET_FILTERS_4
    214 
    215     mov         rsi, arg(0)                 ;src_ptr
    216     mov         rdi, arg(2)                 ;output_ptr
    217 
    218     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    219     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    220     lea         rdx, [rax + rax * 2]
    221     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    222 
    223 .loop:
    224     movd        xmm0, [rsi]                 ;load src: row 0
    225     movd        xmm1, [rsi + rax]           ;1
    226     movd        xmm6, [rsi + rdx * 2]       ;6
    227     lea         rsi,  [rsi + rax]
    228     movd        xmm7, [rsi + rdx * 2]       ;7
    229     movd        xmm2, [rsi + rax]           ;2
    230     movd        xmm3, [rsi + rax * 2]       ;3
    231     movd        xmm4, [rsi + rdx]           ;4
    232     movd        xmm5, [rsi + rax * 4]       ;5
    233 
    234     APPLY_FILTER_4 0
    235 
    236     lea         rdi, [rdi + rbx]
    237     dec         rcx
    238     jnz         .loop
    239 
    240     add rsp, 16 * 6
    241     pop rsp
    242     pop rbx
    243     ; begin epilog
    244     pop rdi
    245     pop rsi
    246     RESTORE_XMM
    247     UNSHADOW_ARGS
    248     pop         rbp
    249     ret
    250 
    251 ;void aom_filter_block1d8_v8_sse2
    252 ;(
    253 ;    unsigned char *src_ptr,
    254 ;    unsigned int   src_pitch,
    255 ;    unsigned char *output_ptr,
    256 ;    unsigned int   out_pitch,
    257 ;    unsigned int   output_height,
    258 ;    short *filter
    259 ;)
    260 global sym(aom_filter_block1d8_v8_sse2) PRIVATE
    261 sym(aom_filter_block1d8_v8_sse2):
    262     push        rbp
    263     mov         rbp, rsp
    264     SHADOW_ARGS_TO_STACK 6
    265     SAVE_XMM 7
    266     push        rsi
    267     push        rdi
    268     push        rbx
    269     ; end prolog
    270 
    271     ALIGN_STACK 16, rax
    272     sub         rsp, 16 * 10
    273     %define k0 [rsp + 16 * 0]
    274     %define k1 [rsp + 16 * 1]
    275     %define k2 [rsp + 16 * 2]
    276     %define k3 [rsp + 16 * 3]
    277     %define k4 [rsp + 16 * 4]
    278     %define k5 [rsp + 16 * 5]
    279     %define k6 [rsp + 16 * 6]
    280     %define k7 [rsp + 16 * 7]
    281     %define krd [rsp + 16 * 8]
    282     %define zero [rsp + 16 * 9]
    283 
    284     GET_FILTERS
    285 
    286     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    287     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    288     lea         rdx, [rax + rax * 2]
    289     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    290 
    291 .loop:
    292     LOAD_VERT_8 0
    293     APPLY_FILTER_8 0, 0
    294 
    295     lea         rdi, [rdi + rbx]
    296     dec         rcx
    297     jnz         .loop
    298 
    299     add rsp, 16 * 10
    300     pop rsp
    301     pop rbx
    302     ; begin epilog
    303     pop rdi
    304     pop rsi
    305     RESTORE_XMM
    306     UNSHADOW_ARGS
    307     pop         rbp
    308     ret
    309 
    310 ;void aom_filter_block1d16_v8_sse2
    311 ;(
    312 ;    unsigned char *src_ptr,
    313 ;    unsigned int   src_pitch,
    314 ;    unsigned char *output_ptr,
    315 ;    unsigned int   out_pitch,
    316 ;    unsigned int   output_height,
    317 ;    short *filter
    318 ;)
    319 global sym(aom_filter_block1d16_v8_sse2) PRIVATE
    320 sym(aom_filter_block1d16_v8_sse2):
    321     push        rbp
    322     mov         rbp, rsp
    323     SHADOW_ARGS_TO_STACK 6
    324     SAVE_XMM 7
    325     push        rsi
    326     push        rdi
    327     push        rbx
    328     ; end prolog
    329 
    330     ALIGN_STACK 16, rax
    331     sub         rsp, 16 * 10
    332     %define k0 [rsp + 16 * 0]
    333     %define k1 [rsp + 16 * 1]
    334     %define k2 [rsp + 16 * 2]
    335     %define k3 [rsp + 16 * 3]
    336     %define k4 [rsp + 16 * 4]
    337     %define k5 [rsp + 16 * 5]
    338     %define k6 [rsp + 16 * 6]
    339     %define k7 [rsp + 16 * 7]
    340     %define krd [rsp + 16 * 8]
    341     %define zero [rsp + 16 * 9]
    342 
    343     GET_FILTERS
    344 
    345     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    346     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    347     lea         rdx, [rax + rax * 2]
    348     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    349 
    350 .loop:
    351     LOAD_VERT_8 0
    352     APPLY_FILTER_8 0, 0
    353     sub         rsi, rax
    354 
    355     LOAD_VERT_8 8
    356     APPLY_FILTER_8 0, 8
    357     add         rdi, rbx
    358 
    359     dec         rcx
    360     jnz         .loop
    361 
    362     add rsp, 16 * 10
    363     pop rsp
    364     pop rbx
    365     ; begin epilog
    366     pop rdi
    367     pop rsi
    368     RESTORE_XMM
    369     UNSHADOW_ARGS
    370     pop         rbp
    371     ret
    372 
    373 ;void aom_filter_block1d4_h8_sse2
    374 ;(
    375 ;    unsigned char  *src_ptr,
    376 ;    unsigned int    src_pixels_per_line,
    377 ;    unsigned char  *output_ptr,
    378 ;    unsigned int    output_pitch,
    379 ;    unsigned int    output_height,
    380 ;    short *filter
    381 ;)
    382 global sym(aom_filter_block1d4_h8_sse2) PRIVATE
    383 sym(aom_filter_block1d4_h8_sse2):
    384     push        rbp
    385     mov         rbp, rsp
    386     SHADOW_ARGS_TO_STACK 6
    387     SAVE_XMM 7
    388     push        rsi
    389     push        rdi
    390     ; end prolog
    391 
    392     ALIGN_STACK 16, rax
    393     sub         rsp, 16 * 6
    394     %define k0k1 [rsp + 16 * 0]
    395     %define k2k3 [rsp + 16 * 1]
    396     %define k5k4 [rsp + 16 * 2]
    397     %define k6k7 [rsp + 16 * 3]
    398     %define krd [rsp + 16 * 4]
    399     %define zero [rsp + 16 * 5]
    400 
    401     GET_FILTERS_4
    402 
    403     mov         rsi, arg(0)                 ;src_ptr
    404     mov         rdi, arg(2)                 ;output_ptr
    405 
    406     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    407     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    408     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    409 
    410 .loop:
    411     movdqu      xmm0,   [rsi - 3]           ;load src
    412 
    413     movdqa      xmm1, xmm0
    414     movdqa      xmm6, xmm0
    415     movdqa      xmm7, xmm0
    416     movdqa      xmm2, xmm0
    417     movdqa      xmm3, xmm0
    418     movdqa      xmm5, xmm0
    419     movdqa      xmm4, xmm0
    420 
    421     psrldq      xmm1, 1
    422     psrldq      xmm6, 6
    423     psrldq      xmm7, 7
    424     psrldq      xmm2, 2
    425     psrldq      xmm3, 3
    426     psrldq      xmm5, 5
    427     psrldq      xmm4, 4
    428 
    429     APPLY_FILTER_4 0
    430 
    431     lea         rsi, [rsi + rax]
    432     lea         rdi, [rdi + rdx]
    433     dec         rcx
    434     jnz         .loop
    435 
    436     add rsp, 16 * 6
    437     pop rsp
    438 
    439     ; begin epilog
    440     pop rdi
    441     pop rsi
    442     RESTORE_XMM
    443     UNSHADOW_ARGS
    444     pop         rbp
    445     ret
    446 
    447 ;void aom_filter_block1d8_h8_sse2
    448 ;(
    449 ;    unsigned char  *src_ptr,
    450 ;    unsigned int    src_pixels_per_line,
    451 ;    unsigned char  *output_ptr,
    452 ;    unsigned int    output_pitch,
    453 ;    unsigned int    output_height,
    454 ;    short *filter
    455 ;)
    456 global sym(aom_filter_block1d8_h8_sse2) PRIVATE
    457 sym(aom_filter_block1d8_h8_sse2):
    458     push        rbp
    459     mov         rbp, rsp
    460     SHADOW_ARGS_TO_STACK 6
    461     SAVE_XMM 7
    462     push        rsi
    463     push        rdi
    464     ; end prolog
    465 
    466     ALIGN_STACK 16, rax
    467     sub         rsp, 16 * 10
    468     %define k0 [rsp + 16 * 0]
    469     %define k1 [rsp + 16 * 1]
    470     %define k2 [rsp + 16 * 2]
    471     %define k3 [rsp + 16 * 3]
    472     %define k4 [rsp + 16 * 4]
    473     %define k5 [rsp + 16 * 5]
    474     %define k6 [rsp + 16 * 6]
    475     %define k7 [rsp + 16 * 7]
    476     %define krd [rsp + 16 * 8]
    477     %define zero [rsp + 16 * 9]
    478 
    479     GET_FILTERS
    480 
    481     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    482     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    483     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    484 
    485 .loop:
    486     movdqu      xmm0,   [rsi - 3]           ;load src
    487 
    488     movdqa      xmm1, xmm0
    489     movdqa      xmm6, xmm0
    490     movdqa      xmm7, xmm0
    491     movdqa      xmm2, xmm0
    492     movdqa      xmm5, xmm0
    493     movdqa      xmm3, xmm0
    494     movdqa      xmm4, xmm0
    495 
    496     psrldq      xmm1, 1
    497     psrldq      xmm6, 6
    498     psrldq      xmm7, 7
    499     psrldq      xmm2, 2
    500     psrldq      xmm5, 5
    501     psrldq      xmm3, 3
    502     psrldq      xmm4, 4
    503 
    504     APPLY_FILTER_8 0, 0
    505 
    506     lea         rsi, [rsi + rax]
    507     lea         rdi, [rdi + rdx]
    508     dec         rcx
    509     jnz         .loop
    510 
    511     add rsp, 16 * 10
    512     pop rsp
    513 
    514     ; begin epilog
    515     pop rdi
    516     pop rsi
    517     RESTORE_XMM
    518     UNSHADOW_ARGS
    519     pop         rbp
    520     ret
    521 
    522 ;void aom_filter_block1d16_h8_sse2
    523 ;(
    524 ;    unsigned char  *src_ptr,
    525 ;    unsigned int    src_pixels_per_line,
    526 ;    unsigned char  *output_ptr,
    527 ;    unsigned int    output_pitch,
    528 ;    unsigned int    output_height,
    529 ;    short *filter
    530 ;)
    531 global sym(aom_filter_block1d16_h8_sse2) PRIVATE
    532 sym(aom_filter_block1d16_h8_sse2):
    533     push        rbp
    534     mov         rbp, rsp
    535     SHADOW_ARGS_TO_STACK 6
    536     SAVE_XMM 7
    537     push        rsi
    538     push        rdi
    539     ; end prolog
    540 
    541     ALIGN_STACK 16, rax
    542     sub         rsp, 16 * 10
    543     %define k0 [rsp + 16 * 0]
    544     %define k1 [rsp + 16 * 1]
    545     %define k2 [rsp + 16 * 2]
    546     %define k3 [rsp + 16 * 3]
    547     %define k4 [rsp + 16 * 4]
    548     %define k5 [rsp + 16 * 5]
    549     %define k6 [rsp + 16 * 6]
    550     %define k7 [rsp + 16 * 7]
    551     %define krd [rsp + 16 * 8]
    552     %define zero [rsp + 16 * 9]
    553 
    554     GET_FILTERS
    555 
    556     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    557     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    558     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    559 
    560 .loop:
    561     movdqu      xmm0,   [rsi - 3]           ;load src
    562 
    563     movdqa      xmm1, xmm0
    564     movdqa      xmm6, xmm0
    565     movdqa      xmm7, xmm0
    566     movdqa      xmm2, xmm0
    567     movdqa      xmm5, xmm0
    568     movdqa      xmm3, xmm0
    569     movdqa      xmm4, xmm0
    570 
    571     psrldq      xmm1, 1
    572     psrldq      xmm6, 6
    573     psrldq      xmm7, 7
    574     psrldq      xmm2, 2
    575     psrldq      xmm5, 5
    576     psrldq      xmm3, 3
    577     psrldq      xmm4, 4
    578 
    579     APPLY_FILTER_8 0, 0
    580 
    581     movdqu      xmm0,   [rsi + 5]           ;load src
    582 
    583     movdqa      xmm1, xmm0
    584     movdqa      xmm6, xmm0
    585     movdqa      xmm7, xmm0
    586     movdqa      xmm2, xmm0
    587     movdqa      xmm5, xmm0
    588     movdqa      xmm3, xmm0
    589     movdqa      xmm4, xmm0
    590 
    591     psrldq      xmm1, 1
    592     psrldq      xmm6, 6
    593     psrldq      xmm7, 7
    594     psrldq      xmm2, 2
    595     psrldq      xmm5, 5
    596     psrldq      xmm3, 3
    597     psrldq      xmm4, 4
    598 
    599     APPLY_FILTER_8 0, 8
    600 
    601     lea         rsi, [rsi + rax]
    602     lea         rdi, [rdi + rdx]
    603     dec         rcx
    604     jnz         .loop
    605 
    606     add rsp, 16 * 10
    607     pop rsp
    608 
    609     ; begin epilog
    610     pop rdi
    611     pop rsi
    612     RESTORE_XMM
    613     UNSHADOW_ARGS
    614     pop         rbp
    615     ret
    616