Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %define xmm_filter_shift            7
     15 
     16 
     17 ;void vp8_filter_block2d_bil_var_ssse3
     18 ;(
     19 ;    unsigned char *ref_ptr,
     20 ;    int ref_pixels_per_line,
     21 ;    unsigned char *src_ptr,
     22 ;    int src_pixels_per_line,
     23 ;    unsigned int Height,
     24 ;    int  xoffset,
     25 ;    int  yoffset,
     26 ;    int *sum,
     27 ;    unsigned int *sumsquared;;
     28 ;
     29 ;)
     30 ;Note: The filter coefficient at offset=0 is 128. Since the second register
     31 ;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
     32 global sym(vp8_filter_block2d_bil_var_ssse3)
     33 sym(vp8_filter_block2d_bil_var_ssse3):
     34     push        rbp
     35     mov         rbp, rsp
     36     SHADOW_ARGS_TO_STACK 9
     37     SAVE_XMM
     38     GET_GOT     rbx
     39     push rsi
     40     push rdi
     41     ; end prolog
     42 
     43         pxor            xmm6,           xmm6
     44         pxor            xmm7,           xmm7
     45 
     46         lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
     47         movsxd          rax,            dword ptr arg(5)     ; xoffset
     48 
     49         cmp             rax,            0                    ; skip first_pass filter if xoffset=0
     50         je              filter_block2d_bil_var_ssse3_sp_only
     51 
     52         shl             rax,            4                    ; point to filter coeff with xoffset
     53         lea             rax,            [rax + rcx]          ; HFilter
     54 
     55         movsxd          rdx,            dword ptr arg(6)     ; yoffset
     56 
     57         cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
     58         je              filter_block2d_bil_var_ssse3_fp_only
     59 
     60         shl             rdx,            4
     61         lea             rdx,            [rdx + rcx]          ; VFilter
     62 
     63         mov             rsi,            arg(0)               ;ref_ptr
     64         mov             rdi,            arg(2)               ;src_ptr
     65         movsxd          rcx,            dword ptr arg(4)     ;Height
     66 
     67         movdqu          xmm0,           XMMWORD PTR [rsi]
     68         movdqu          xmm1,           XMMWORD PTR [rsi+1]
     69         movdqa          xmm2,           xmm0
     70 
     71         punpcklbw       xmm0,           xmm1
     72         punpckhbw       xmm2,           xmm1
     73         pmaddubsw       xmm0,           [rax]
     74         pmaddubsw       xmm2,           [rax]
     75 
     76         paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
     77         paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
     78         psraw           xmm0,           xmm_filter_shift
     79         psraw           xmm2,           xmm_filter_shift
     80 
     81         packuswb        xmm0,           xmm2
     82 
     83 %if ABI_IS_32BIT
     84         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
     85 %else
     86         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
     87         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
     88         lea             rsi,            [rsi + r8]
     89 %endif
     90 
     91 filter_block2d_bil_var_ssse3_loop:
     92         movdqu          xmm1,           XMMWORD PTR [rsi]
     93         movdqu          xmm2,           XMMWORD PTR [rsi+1]
     94         movdqa          xmm3,           xmm1
     95 
     96         punpcklbw       xmm1,           xmm2
     97         punpckhbw       xmm3,           xmm2
     98         pmaddubsw       xmm1,           [rax]
     99         pmaddubsw       xmm3,           [rax]
    100 
    101         paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
    102         paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
    103         psraw           xmm1,           xmm_filter_shift
    104         psraw           xmm3,           xmm_filter_shift
    105         packuswb        xmm1,           xmm3
    106 
    107         movdqa          xmm2,           xmm0
    108         movdqa          xmm0,           xmm1
    109         movdqa          xmm3,           xmm2
    110 
    111         punpcklbw       xmm2,           xmm1
    112         punpckhbw       xmm3,           xmm1
    113         pmaddubsw       xmm2,           [rdx]
    114         pmaddubsw       xmm3,           [rdx]
    115 
    116         paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
    117         paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
    118         psraw           xmm2,           xmm_filter_shift
    119         psraw           xmm3,           xmm_filter_shift
    120 
    121         movq            xmm1,           QWORD PTR [rdi]
    122         pxor            xmm4,           xmm4
    123         punpcklbw       xmm1,           xmm4
    124         movq            xmm5,           QWORD PTR [rdi+8]
    125         punpcklbw       xmm5,           xmm4
    126 
    127         psubw           xmm2,           xmm1
    128         psubw           xmm3,           xmm5
    129         paddw           xmm6,           xmm2
    130         paddw           xmm6,           xmm3
    131         pmaddwd         xmm2,           xmm2
    132         pmaddwd         xmm3,           xmm3
    133         paddd           xmm7,           xmm2
    134         paddd           xmm7,           xmm3
    135 
    136 %if ABI_IS_32BIT
    137         add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
    138         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
    139 %else
    140         lea             rsi,            [rsi + r8]
    141         lea             rdi,            [rdi + r9]
    142 %endif
    143 
    144         sub             rcx,            1
    145         jnz             filter_block2d_bil_var_ssse3_loop
    146 
    147         jmp             filter_block2d_bil_variance
    148 
    149 filter_block2d_bil_var_ssse3_sp_only:
    150         movsxd          rdx,            dword ptr arg(6)     ; yoffset
    151 
    152         cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
    153         je              filter_block2d_bil_var_ssse3_full_pixel
    154 
    155         shl             rdx,            4
    156         lea             rdx,            [rdx + rcx]          ; VFilter
    157 
    158         mov             rsi,            arg(0)               ;ref_ptr
    159         mov             rdi,            arg(2)               ;src_ptr
    160         movsxd          rcx,            dword ptr arg(4)     ;Height
    161         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
    162 
    163         movdqu          xmm1,           XMMWORD PTR [rsi]
    164         movdqa          xmm0,           xmm1
    165 
    166 %if ABI_IS_32BIT=0
    167         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
    168 %endif
    169 
    170         lea             rsi,            [rsi + rax]
    171 
    172 filter_block2d_bil_sp_only_loop:
    173         movdqu          xmm3,           XMMWORD PTR [rsi]
    174         movdqa          xmm2,           xmm1
    175         movdqa          xmm0,           xmm3
    176 
    177         punpcklbw       xmm1,           xmm3
    178         punpckhbw       xmm2,           xmm3
    179         pmaddubsw       xmm1,           [rdx]
    180         pmaddubsw       xmm2,           [rdx]
    181 
    182         paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
    183         paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
    184         psraw           xmm1,           xmm_filter_shift
    185         psraw           xmm2,           xmm_filter_shift
    186 
    187         movq            xmm3,           QWORD PTR [rdi]
    188         pxor            xmm4,           xmm4
    189         punpcklbw       xmm3,           xmm4
    190         movq            xmm5,           QWORD PTR [rdi+8]
    191         punpcklbw       xmm5,           xmm4
    192 
    193         psubw           xmm1,           xmm3
    194         psubw           xmm2,           xmm5
    195         paddw           xmm6,           xmm1
    196         paddw           xmm6,           xmm2
    197         pmaddwd         xmm1,           xmm1
    198         pmaddwd         xmm2,           xmm2
    199         paddd           xmm7,           xmm1
    200         paddd           xmm7,           xmm2
    201 
    202         movdqa          xmm1,           xmm0
    203         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
    204 
    205 %if ABI_IS_32BIT
    206         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
    207 %else
    208         lea             rdi,            [rdi + r9]
    209 %endif
    210 
    211         sub             rcx,            1
    212         jnz             filter_block2d_bil_sp_only_loop
    213 
    214         jmp             filter_block2d_bil_variance
    215 
    216 filter_block2d_bil_var_ssse3_full_pixel:
    217         mov             rsi,            arg(0)               ;ref_ptr
    218         mov             rdi,            arg(2)               ;src_ptr
    219         movsxd          rcx,            dword ptr arg(4)     ;Height
    220         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
    221         movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
    222         pxor            xmm0,           xmm0
    223 
    224 filter_block2d_bil_full_pixel_loop:
    225         movq            xmm1,           QWORD PTR [rsi]
    226         punpcklbw       xmm1,           xmm0
    227         movq            xmm2,           QWORD PTR [rsi+8]
    228         punpcklbw       xmm2,           xmm0
    229 
    230         movq            xmm3,           QWORD PTR [rdi]
    231         punpcklbw       xmm3,           xmm0
    232         movq            xmm4,           QWORD PTR [rdi+8]
    233         punpcklbw       xmm4,           xmm0
    234 
    235         psubw           xmm1,           xmm3
    236         psubw           xmm2,           xmm4
    237         paddw           xmm6,           xmm1
    238         paddw           xmm6,           xmm2
    239         pmaddwd         xmm1,           xmm1
    240         pmaddwd         xmm2,           xmm2
    241         paddd           xmm7,           xmm1
    242         paddd           xmm7,           xmm2
    243 
    244         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
    245         lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
    246         sub             rcx,            1
    247         jnz             filter_block2d_bil_full_pixel_loop
    248 
    249         jmp             filter_block2d_bil_variance
    250 
    251 filter_block2d_bil_var_ssse3_fp_only:
    252         mov             rsi,            arg(0)               ;ref_ptr
    253         mov             rdi,            arg(2)               ;src_ptr
    254         movsxd          rcx,            dword ptr arg(4)     ;Height
    255         movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
    256 
    257         pxor            xmm0,           xmm0
    258 
    259 %if ABI_IS_32BIT=0
    260         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
    261 %endif
    262 
    263 filter_block2d_bil_fp_only_loop:
    264         movdqu          xmm1,           XMMWORD PTR [rsi]
    265         movdqu          xmm2,           XMMWORD PTR [rsi+1]
    266         movdqa          xmm3,           xmm1
    267 
    268         punpcklbw       xmm1,           xmm2
    269         punpckhbw       xmm3,           xmm2
    270         pmaddubsw       xmm1,           [rax]
    271         pmaddubsw       xmm3,           [rax]
    272 
    273         paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
    274         paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
    275         psraw           xmm1,           xmm_filter_shift
    276         psraw           xmm3,           xmm_filter_shift
    277 
    278         movq            xmm2,           XMMWORD PTR [rdi]
    279         pxor            xmm4,           xmm4
    280         punpcklbw       xmm2,           xmm4
    281         movq            xmm5,           QWORD PTR [rdi+8]
    282         punpcklbw       xmm5,           xmm4
    283 
    284         psubw           xmm1,           xmm2
    285         psubw           xmm3,           xmm5
    286         paddw           xmm6,           xmm1
    287         paddw           xmm6,           xmm3
    288         pmaddwd         xmm1,           xmm1
    289         pmaddwd         xmm3,           xmm3
    290         paddd           xmm7,           xmm1
    291         paddd           xmm7,           xmm3
    292 
    293         lea             rsi,            [rsi + rdx]
    294 %if ABI_IS_32BIT
    295         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
    296 %else
    297         lea             rdi,            [rdi + r9]
    298 %endif
    299 
    300         sub             rcx,            1
    301         jnz             filter_block2d_bil_fp_only_loop
    302 
    303         jmp             filter_block2d_bil_variance
    304 
    305 filter_block2d_bil_variance:
    306         pxor        xmm0,           xmm0
    307         pxor        xmm1,           xmm1
    308         pxor        xmm5,           xmm5
    309 
    310         punpcklwd   xmm0,           xmm6
    311         punpckhwd   xmm1,           xmm6
    312         psrad       xmm0,           16
    313         psrad       xmm1,           16
    314         paddd       xmm0,           xmm1
    315         movdqa      xmm1,           xmm0
    316 
    317         movdqa      xmm6,           xmm7
    318         punpckldq   xmm6,           xmm5
    319         punpckhdq   xmm7,           xmm5
    320         paddd       xmm6,           xmm7
    321 
    322         punpckldq   xmm0,           xmm5
    323         punpckhdq   xmm1,           xmm5
    324         paddd       xmm0,           xmm1
    325 
    326         movdqa      xmm7,           xmm6
    327         movdqa      xmm1,           xmm0
    328 
    329         psrldq      xmm7,           8
    330         psrldq      xmm1,           8
    331 
    332         paddd       xmm6,           xmm7
    333         paddd       xmm0,           xmm1
    334 
    335         mov         rsi,            arg(7) ;[Sum]
    336         mov         rdi,            arg(8) ;[SSE]
    337 
    338         movd        [rsi],       xmm0
    339         movd        [rdi],       xmm6
    340 
    341     ; begin epilog
    342     pop rdi
    343     pop rsi
    344     RESTORE_GOT
    345     RESTORE_XMM
    346     UNSHADOW_ARGS
    347     pop         rbp
    348     ret
    349 
    350 
    351 SECTION_RODATA
    352 align 16
    353 xmm_bi_rd:
    354     times 8 dw 64
    355 align 16
    356 vp8_bilinear_filters_ssse3:
    357     times 8 db 128, 0
    358     times 8 db 112, 16
    359     times 8 db 96,  32
    360     times 8 db 80,  48
    361     times 8 db 64,  64
    362     times 8 db 48,  80
    363     times 8 db 32,  96
    364     times 8 db 16,  112
    365