Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ; FIRST_2_ROWS - first half of the deblock filter kernel: processes the first pair of neighbours of 16 pixels.
     15 %macro FIRST_2_ROWS 0                           ; in: xmm0 = centre pixels, xmm1/xmm3 = two neighbours, flimit = 16 threshold bytes
     16         movdqa      xmm4,       xmm0            ; copy of centre for |centre - xmm1|
     17         movdqa      xmm6,       xmm0            ; copy of centre for |centre - xmm3|
     18         movdqa      xmm5,       xmm1
     19         pavgb       xmm5,       xmm3            ; out: xmm5 = rounded average of the two neighbours
     20 
     21         ;calculate absolute value: for unsigned bytes |a-b| = sat(a-b) + sat(b-a), one term is always 0
     22         psubusb     xmm4,       xmm1
     23         psubusb     xmm1,       xmm0
     24         psubusb     xmm6,       xmm3
     25         psubusb     xmm3,       xmm0
     26         paddusb     xmm4,       xmm1            ; xmm4 = |centre - neighbour 1|
     27         paddusb     xmm6,       xmm3            ; xmm6 = |centre - neighbour 2|
     28 
     29         ;get threshold
     30         movdqa      xmm2,       flimit          ; aligned load: flimit must be a 16-byte aligned memory operand
     31         pxor        xmm1,       xmm1            ; xmm1 = 0, used for the compares below
     32         movdqa      xmm7,       xmm2
     33 
     34         ;get mask: a byte becomes 0xFF where flimit <= |difference| (saturating subtract yields 0)
     35         psubusb     xmm2,       xmm4
     36         psubusb     xmm7,       xmm6
     37         pcmpeqb     xmm2,       xmm1
     38         pcmpeqb     xmm7,       xmm1
     39         por         xmm7,       xmm2            ; out: xmm7 = combined mask; xmm0 untouched; xmm1-xmm4, xmm6 clobbered
     40 %endmacro
     41 
     42 %macro SECOND_2_ROWS 0                          ; in: xmm0 = centre, xmm1/xmm3 = other two neighbours, xmm5/xmm7 as left by FIRST_2_ROWS
     43         movdqa      xmm6,       xmm0
     44         movdqa      xmm4,       xmm0
     45         movdqa      xmm2,       xmm1            ; keep neighbour 1, xmm1 is overwritten by the average below
     46         pavgb       xmm1,       xmm3            ; xmm1 = rounded average of this pair of neighbours
     47 
     48         ;calculate absolute value: |a-b| = sat(a-b) + sat(b-a) for unsigned bytes
     49         psubusb     xmm6,       xmm2
     50         psubusb     xmm2,       xmm0
     51         psubusb     xmm4,       xmm3
     52         psubusb     xmm3,       xmm0
     53         paddusb     xmm6,       xmm2            ; xmm6 = |centre - neighbour 1|
     54         paddusb     xmm4,       xmm3            ; xmm4 = |centre - neighbour 2|
     55 
     56         pavgb       xmm5,       xmm1            ; xmm5 = average of the two pair averages (all four neighbours)
     57 
     58         ;get threshold
     59         movdqa      xmm2,       flimit          ; aligned load: flimit must be a 16-byte aligned memory operand
     60         pxor        xmm1,       xmm1            ; xmm1 = 0 for the compares below
     61         movdqa      xmm3,       xmm2
     62 
     63         ;get mask: a byte becomes 0xFF where flimit <= |difference|
     64         psubusb     xmm2,       xmm6
     65         psubusb     xmm3,       xmm4
     66         pcmpeqb     xmm2,       xmm1
     67         pcmpeqb     xmm3,       xmm1
     68 
     69         por         xmm7,       xmm2            ; accumulate into the mask started by FIRST_2_ROWS
     70         por         xmm7,       xmm3
     71 
     72         pavgb       xmm5,       xmm0            ; xmm5 = filtered value: average of neighbour average and centre
     73 
     74         ;decide whether or not to use the filtered value
     75         pand        xmm0,       xmm7            ; keep the original pixel where any difference reached the threshold
     76         pandn       xmm7,       xmm5            ; take the filtered pixel everywhere else
     77         paddusb     xmm0,       xmm7            ; merge the two disjoint selections; out: xmm0 = result (xmm1-xmm7 clobbered)
     78 %endmacro
     79 
     80 %macro UPDATE_FLIMIT 0                          ; copy the next 16 threshold bytes from [rbx] to the stack slot at [rsp]
     81         movdqu      xmm2,       XMMWORD PTR [rbx] ; unaligned load: no alignment is assumed for the pointer in rbx
     82         movdqu      [rsp],      xmm2            ; [rsp] is what the filter macros read as "flimit"; clobbers xmm2
     83         add         rbx,        16              ; advance to the thresholds of the next 16 columns
     84 %endmacro
     85 
     86 SECTION .text
     87 
     88 ;void vpx_post_proc_down_and_across_mb_row_sse2  - thresholded 5-tap smoothing: vertical src->dst, then horizontal in place on dst
     89 ;(
     90 ;    unsigned char *src_ptr,       ; source rows; rows -2..+2 around each processed row are read
     91 ;    unsigned char *dst_ptr,       ; destination rows; 16 bytes left and 8 bytes right of each row are also written
     92 ;    int src_pixels_per_line,      ; source stride in bytes (sign-extended)
     93 ;    int dst_pixels_per_line,      ; destination stride in bytes (sign-extended)
     94 ;    int cols,                     ; number of pixel columns; processed in chunks of 16
     95 ;    int *flimits,                 ; thresholds; consumed as one byte per column, 16 bytes per chunk
     96 ;    int size                      ; number of rows to process
     97 ;)
     98 global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
     99 sym(vpx_post_proc_down_and_across_mb_row_sse2):
    100     push        rbp
    101     mov         rbp, rsp
    102     SHADOW_ARGS_TO_STACK 7
    103     SAVE_XMM 7
    104     push        rbx
    105     push        rsi
    106     push        rdi
    107     ; end prolog
    108     ALIGN_STACK 16, rax                          ; flimit is read with movdqa, so the slot must be 16-byte aligned
    109     sub         rsp, 16                          ; 16-byte scratch slot holding the current thresholds
    110 
    111         ; put flimit on stack
    112         mov         rbx,        arg(5)           ;flimits ptr
    113         UPDATE_FLIMIT
    114 
    115 %define flimit [rsp]
    116 
    117         mov         rsi,        arg(0)           ;src_ptr
    118         mov         rdi,        arg(1)           ;dst_ptr
    119 
    120         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
    121         movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
    122 .nextrow:
    123         xor         rdx,        rdx              ;col
    124 .nextcol:
    125         ;load current and next 2 rows
    126         movdqu      xmm0,       XMMWORD PTR [rsi]
    127         movdqu      xmm1,       XMMWORD PTR [rsi + rax]
    128         movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
    129 
    130         FIRST_2_ROWS
    131 
    132         ;load above 2 rows
    133         neg         rax                          ; negative stride to address the rows above
    134         movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
    135         movdqu      xmm3,       XMMWORD PTR [rsi + rax]
    136 
    137         SECOND_2_ROWS
    138 
    139         movdqu      XMMWORD PTR [rdi], xmm0      ; vertically filtered 16 pixels go to dst
    140 
    141         neg         rax                          ; positive stride
    142         add         rsi,        16
    143         add         rdi,        16
    144 
    145         add         rdx,        16
    146         cmp         edx,        dword arg(4)     ;cols
    147         jge         .downdone
    148         UPDATE_FLIMIT
    149         jmp         .nextcol
    150 
    151 .downdone:
    152         ; done with the all cols, start the across filtering in place
    153         sub         rsi,        rdx              ; rewind both pointers to the start of the row
    154         sub         rdi,        rdx
    155 
    156         mov         rbx,        arg(5) ; flimits
    157         UPDATE_FLIMIT
    158 
    159         ; dup the first byte into the left border 8 times
    160         movq        mm1,   [rdi]
    161         punpcklbw   mm1,   mm1
    162         punpcklwd   mm1,   mm1
    163         punpckldq   mm1,   mm1
    164         mov         rdx,    -8
    165         movq        [rdi+rdx], mm1
    166 
    167         ; dup the last byte into the right border
    168         movsxd      rdx,    dword arg(4)
    169         movq        mm1,   [rdi + rdx + -1]
    170         punpcklbw   mm1,   mm1
    171         punpcklwd   mm1,   mm1
    172         punpckldq   mm1,   mm1
    173         movq        [rdi+rdx], mm1
    174 
    175         xor         rdx,        rdx
    176         movq        mm0,        QWORD PTR [rdi-16]; mm0/mm1 hold the 16 bytes that are written back one chunk late
    177         movq        mm1,        QWORD PTR [rdi-8];  (first pass: the bytes already in memory, so nothing changes)
    178 
    179 .acrossnextcol:
    180         movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
    181         movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
    182         movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
    183 
    184         FIRST_2_ROWS
    185 
    186         movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
    187         movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
    188 
    189         SECOND_2_ROWS
    190 
    191         movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes (delayed so the left-neighbour loads above saw unfiltered data)
    192         movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
    193         movdq2q     mm0,        xmm0             ; stash low 8 result bytes for the next pass
    194         psrldq      xmm0,       8
    195         movdq2q     mm1,        xmm0             ; stash high 8 result bytes for the next pass
    196 
    197         add         rdx,        16
    198         cmp         edx,        dword arg(4)     ;cols
    199         jge         .acrossdone
    200         UPDATE_FLIMIT
    201         jmp         .acrossnextcol
    202 
    203 .acrossdone:
    204         ; last 16 pixels
    205         movq        QWORD PTR [rdi+rdx-16], mm0
    206 
    207         cmp         edx,        dword arg(4)     ; rdx != cols means cols is not a multiple of 16
    208         jne         .throw_last_8                ; in that case the upper 8 results are not stored
    209         movq        QWORD PTR [rdi+rdx-8], mm1
    210 .throw_last_8:
    211         ; done with this row
    212         add         rsi,rax                      ;next src line
    213         movsxd      rax, dword arg(3)            ;dst_pixels_per_line (sign-extended so a negative stride steps correctly)
    214         add         rdi,rax                      ;next destination
    215         movsxd      rax, dword arg(2)            ;src_pixels_per_line (sign-extended, same as the initial load)
    216 
    217         mov         rbx,        arg(5)           ;flimits
    218         UPDATE_FLIMIT
    219 
    220         dec         rcx                          ;decrement count
    221         jnz         .nextrow                     ;next row
    222 
    223     add rsp, 16
    224     pop rsp
    225     ; begin epilog (NOTE(review): MMX registers were used and no emms is issued here; presumably the caller clears that state - confirm)
    226     pop rdi
    227     pop rsi
    228     pop rbx
    229     RESTORE_XMM
    230     UNSHADOW_ARGS
    231     pop         rbp
    232     ret
    233 %undef flimit
    234 
    235 ;void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols, int flimit)
    236 ; Vertical 15-row sliding-window smoothing in place, 8 columns per pass; a pixel is replaced only where 15*sumsq - sum*sum < flimit.
    237 extern sym(vpx_rv)
    238 global sym(vpx_mbpost_proc_down_sse2) PRIVATE
    239 sym(vpx_mbpost_proc_down_sse2):
    240     push        rbp
    241     mov         rbp, rsp
    242     SHADOW_ARGS_TO_STACK 5
    243     SAVE_XMM 7
    244     GET_GOT     rbx
    245     push        rsi
    246     push        rdi
    247     ; end prolog
    248 
    249     ALIGN_STACK 16, rax
    250     sub         rsp, 128+16
    251 
    252     ; unsigned char d[16][8] at [rsp]  (ring buffer of 16 output rows, 8 bytes each)
    253     ; create flimit4 at [rsp+128]      (flimit replicated into four dwords)
    254     mov         eax, dword ptr arg(4) ;flimit
    255     mov         [rsp+128], eax
    256     mov         [rsp+128+4], eax
    257     mov         [rsp+128+8], eax
    258     mov         [rsp+128+12], eax
    259 %define flimit4 [rsp+128]
    260 
    261 %if ABI_IS_32BIT=0
    262     lea         r8,       [GLOBAL(sym(vpx_rv))]
    263 %endif
    264 
    265     ;rows +=8;
    266     add         dword arg(2), 8
    267 
    268     ;for(c=0; c<cols; c+=8)
    269 .loop_col:
    270             mov         rsi,        arg(0) ; s
    271             pxor        xmm0,       xmm0        ; xmm0 = 0 for unpacking bytes/words
    272 
    273             movsxd      rax,        dword ptr arg(1) ;pitch       ;
    274 
    275             ; this copies the last row down into the border 8 rows
    276             mov         rdi,        rsi
    277             movsxd      rdx,        dword arg(2) ; rows+8; only the low dword of this int slot is defined, so sign-extend it
    278             sub         rdx,        9            ; (rows+8)-9 = index of the last real row
    279             imul        rdx,        rax
    280             lea         rdi,        [rdi+rdx]
    281             movq        xmm1,       QWORD ptr[rdi]              ; last row
    282             mov         rcx,        8
    283 .init_borderd:                                                  ; initialize borders
    284             lea         rdi,        [rdi + rax]
    285             movq        [rdi],      xmm1
    286 
    287             dec         rcx
    288             jne         .init_borderd
    289 
    290             neg         rax                                     ; rax = -pitch
    291 
    292             ; this copies the first row up into the border 8 rows
    293             mov         rdi,        rsi
    294             movq        xmm1,       QWORD ptr[rdi]              ; first row
    295             mov         rcx,        8
    296 .init_border:                                                   ; initialize borders
    297             lea         rdi,        [rdi + rax]
    298             movq        [rdi],      xmm1
    299 
    300             dec         rcx
    301             jne         .init_border
    302 
    303 
    304 
    305             lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
    306             neg         rax                                     ; rax = +pitch again
    307 
    308             pxor        xmm5,       xmm5        ; xmm5 = running sum, 8 words (one per column)
    309             pxor        xmm6,       xmm6        ; xmm6 = running sum of squares, columns 0-3 (dwords)
    310 
    311             pxor        xmm7,       xmm7        ; xmm7 = running sum of squares, columns 4-7 (dwords)
    312             mov         rdi,        rsi
    313 
    314             mov         rcx,        15          ; prime the window with rows -8..+6
    315 
    316 .loop_initvar:
    317             movq        xmm1,       QWORD PTR [rdi];
    318             punpcklbw   xmm1,       xmm0        ;
    319 
    320             paddw       xmm5,       xmm1        ;
    321             pmullw      xmm1,       xmm1        ;
    322 
    323             movdqa      xmm2,       xmm1        ;
    324             punpcklwd   xmm1,       xmm0        ;
    325 
    326             punpckhwd   xmm2,       xmm0        ;
    327             paddd       xmm6,       xmm1        ;
    328 
    329             paddd       xmm7,       xmm2        ;
    330             lea         rdi,        [rdi+rax]   ;
    331 
    332             dec         rcx
    333             jne         .loop_initvar
    334             ;save the var and sum
    335             xor         rdx,        rdx          ; rdx = row counter r
    336 .loop_row:
    337             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]  row leaving the window
    338             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]  row entering the window
    339 
    340             punpcklbw   xmm1,       xmm0
    341             punpcklbw   xmm2,       xmm0
    342 
    343             paddw       xmm5,       xmm2
    344             psubw       xmm5,       xmm1
    345 
    346             pmullw      xmm2,       xmm2
    347             movdqa      xmm4,       xmm2
    348 
    349             punpcklwd   xmm2,       xmm0
    350             punpckhwd   xmm4,       xmm0
    351 
    352             paddd       xmm6,       xmm2
    353             paddd       xmm7,       xmm4
    354 
    355             pmullw      xmm1,       xmm1
    356             movdqa      xmm2,       xmm1
    357 
    358             punpcklwd   xmm1,       xmm0
    359             psubd       xmm6,       xmm1
    360 
    361             punpckhwd   xmm2,       xmm0
    362             psubd       xmm7,       xmm2
    363 
    364             ; xmm3/xmm4 = sumsq*16 - sumsq - sum*sum - flimit, per column
    365             movdqa      xmm3,       xmm6
    366             pslld       xmm3,       4
    367 
    368             psubd       xmm3,       xmm6
    369             movdqa      xmm1,       xmm5
    370 
    371             movdqa      xmm4,       xmm5
    372             pmullw      xmm1,       xmm1
    373 
    374             pmulhw      xmm4,       xmm4
    375             movdqa      xmm2,       xmm1
    376 
    377             punpcklwd   xmm1,       xmm4        ; interleave low/high halves -> 32-bit sum*sum
    378             punpckhwd   xmm2,       xmm4
    379 
    380             movdqa      xmm4,       xmm7
    381             pslld       xmm4,       4
    382 
    383             psubd       xmm4,       xmm7
    384 
    385             psubd       xmm3,       xmm1
    386             psubd       xmm4,       xmm2
    387 
    388             psubd       xmm3,       flimit4
    389             psubd       xmm4,       flimit4
    390 
    391             psrad       xmm3,       31          ; all ones where the value is negative, i.e. below flimit
    392             psrad       xmm4,       31
    393 
    394             packssdw    xmm3,       xmm4
    395             packsswb    xmm3,       xmm0        ; xmm3 = byte mask, 0xFF where the pixel is to be filtered
    396 
    397             movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row r
    398 
    399             movq        xmm2,       xmm1        ; keep the unfiltered pixels
    400             punpcklbw   xmm1,       xmm0
    401 
    402             paddw       xmm1,       xmm5
    403             mov         rcx,        rdx
    404 
    405             and         rcx,        127
    406 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    407             push        rax
    408             lea         rax,        [GLOBAL(sym(vpx_rv))]
    409             movdqu      xmm4,       [rax + rcx*2] ;vpx_rv[rcx*2]
    410             pop         rax
    411 %elif ABI_IS_32BIT=0
    412             movdqu      xmm4,       [r8 + rcx*2] ;vpx_rv[rcx*2]
    413 %else
    414             movdqu      xmm4,       [sym(vpx_rv) + rcx*2]
    415 %endif
    416 
    417             paddw       xmm1,       xmm4        ; add 8 words taken from vpx_rv (used in place of a fixed rounding constant)
    418             ;paddw     xmm1,       eight8s
    419             psraw       xmm1,       4           ; (pixel + sum + vpx_rv word) >> 4
    420 
    421             packuswb    xmm1,       xmm0
    422             pand        xmm1,       xmm3        ; filtered value where the mask is set
    423 
    424             pandn       xmm3,       xmm2        ; original value elsewhere
    425             por         xmm1,       xmm3
    426 
    427             and         rcx,        15
    428             movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
    429 
    430             cmp         edx,        8
    431             jl          .skip_assignment        ; nothing to write back during the first 8 rows
    432 
    433             mov         rcx,        rdx
    434             sub         rcx,        8
    435             and         rcx,        15
    436             movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
    437             movq        [rsi],      mm0         ; write row r-8, which has just left the window
    438 
    439 .skip_assignment:
    440             lea         rsi,        [rsi+rax]
    441 
    442             lea         rdi,        [rdi+rax]
    443             add         rdx,        1
    444 
    445             cmp         edx,        dword arg(2) ;rows
    446             jl          .loop_row
    447             mov         rcx,        8           ; rcx is dead here; lets the pointer be advanced at full width
    448         add         arg(0),     rcx ; s += 8 (a dword add would drop the carry into the upper pointer half)
    449         sub         dword arg(3), 8 ; cols -= 8
    450         cmp         dword arg(3), 0
    451         jg          .loop_col
    452 
    453     add         rsp, 128+16
    454     pop         rsp
    455 
    456     ; begin epilog (NOTE(review): mm0 was used and no emms is issued here; presumably the caller clears that state - confirm)
    457     pop rdi
    458     pop rsi
    459     RESTORE_GOT
    460     RESTORE_XMM
    461     UNSHADOW_ARGS
    462     pop         rbp
    463     ret
    464 %undef flimit4
    465 
    466 
    467 ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols, int flimit)
    468 ; Horizontal 15-pixel sliding-window smoothing in place; pixel c becomes (s[c]+sum+8)>>4 only where 15*sumsq - sum*sum < flimit.
    469 global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
    470 sym(vpx_mbpost_proc_across_ip_sse2):
    471     push        rbp
    472     mov         rbp, rsp
    473     SHADOW_ARGS_TO_STACK 5
    474     SAVE_XMM 7
    475     GET_GOT     rbx
    476     push        rsi
    477     push        rdi
    478     ; end prolog
    479 
    480     ALIGN_STACK 16, rax
    481     sub         rsp, 16
    482 
    483     ; create flimit4 at [rsp]
    484     mov         eax, dword ptr arg(4) ;flimit
    485     mov         [rsp], eax
    486     mov         [rsp+4], eax
    487     mov         [rsp+8], eax
    488     mov         [rsp+12], eax
    489 %define flimit4 [rsp]
    490 
    491 
    492     ;for(r=0;r<rows;r++)
    493 .ip_row_loop:
    494 
    495         xor         rdx,    rdx ;sumsq=0;
    496         xor         rcx,    rcx ;sum=0;
    497         mov         rsi,    arg(0); s
    498 
    499 
    500         ; dup the first byte into the left border 8 times
    501         movq        mm1,   [rsi]
    502         punpcklbw   mm1,   mm1
    503         punpcklwd   mm1,   mm1
    504         punpckldq   mm1,   mm1
    505 
    506         mov         rdi,    -8                  ; rdi doubles as the start index i = -8 of the loop below
    507         movq        [rsi+rdi], mm1
    508 
    509         ; dup the last byte into the right border
    510         movsxd      rax,    dword arg(3)        ; cols goes in rax (dead here) so that rdx keeps sumsq = 0
    511         movq        mm1,   [rsi + rax + -1]
    512         punpcklbw   mm1,   mm1
    513         punpcklwd   mm1,   mm1
    514         punpckldq   mm1,   mm1
    515         movq        [rsi+rax], mm1
    516 
    517 .ip_var_loop:
    518         ;for(i=-8;i<=6;i++)
    519         ;{
    520         ;    sumsq += s[i]*s[i];
    521         ;    sum   += s[i];
    522         ;}
    523         movzx       eax, byte [rsi+rdi]
    524         add         ecx, eax
    525         mul         al                          ; ax = al*al; the 8-bit form leaves edx (sumsq) untouched
    526         add         edx, eax
    527         add         rdi, 1
    528         cmp         rdi, 6
    529         jle         .ip_var_loop
    530 
    531 
    532             ;mov         rax,    sumsq
    533             ;movd        xmm7,   rax
    534             movd        xmm7,   edx
    535 
    536             ;mov         rax,    sum
    537             ;movd        xmm6,   rax
    538             movd        xmm6,   ecx
    539 
    540             mov         rsi,    arg(0) ;s
    541             xor         rcx,    rcx             ; rcx = column index c
    542 
    543             movsxd      rdx,    dword arg(3) ;cols
    544             add         rdx,    8               ; run 8 columns past the end to flush the delayed stores
    545             pxor        mm0,    mm0
    546             pxor        mm1,    mm1
    547 
    548             pxor        xmm0,   xmm0
    549 .nextcol4:
    550 
    551             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (pixels leaving the window)
    552             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (pixels entering the window)
    553 
    554             punpcklbw   xmm1,   xmm0                    ; expanding
    555             punpcklbw   xmm2,   xmm0                    ; expanding
    556 
    557             punpcklwd   xmm1,   xmm0                    ; expanding to dwords
    558             punpcklwd   xmm2,   xmm0                    ; expanding to dwords
    559 
    560             psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5  (sum deltas)
    561             paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
    562 
    563             paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
    564             pmaddwd     xmm1,   xmm2                    ; (new+old)*(new-old) = new^2 - old^2  (sumsq deltas)
    565 
    566             paddd       xmm6,   xmm2
    567             paddd       xmm7,   xmm1
    568 
    569             pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 (sum for column c) to all lanes
    570             pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 (sumsq for column c) to all lanes
    571 
    572             psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
    573             psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
    574 
    575             pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
    576             pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7
    577 
    578             paddd       xmm6,   xmm4
    579             paddd       xmm7,   xmm3
    580 
    581             pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
    582             pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6
    583 
    584             paddd       xmm7,   xmm3
    585             paddd       xmm6,   xmm4
    586 
    587             pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   0000  10--5 squared
    588             pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   0000  10--5
    589 
    590             paddd       xmm7,   xmm3
    591             paddd       xmm6,   xmm4                    ; lanes now hold sum / sumsq for columns c..c+3
    592 
    593             movdqa      xmm3,   xmm6
    594             pmaddwd     xmm3,   xmm3                    ; sum*sum
    595 
    596             movdqa      xmm5,   xmm7
    597             pslld       xmm5,   4
    598 
    599             psubd       xmm5,   xmm7                    ; sumsq*16 - sumsq
    600             psubd       xmm5,   xmm3
    601 
    602             psubd       xmm5,   flimit4
    603             psrad       xmm5,   31                      ; all ones where the value is below flimit
    604 
    605             packssdw    xmm5,   xmm0
    606             packsswb    xmm5,   xmm0
    607 
    608             movd        xmm1,   DWORD PTR [rsi+rcx]
    609             movq        xmm2,   xmm1                    ; keep the unfiltered pixels
    610 
    611             punpcklbw   xmm1,   xmm0
    612             punpcklwd   xmm1,   xmm0
    613 
    614             paddd       xmm1,   xmm6
    615             paddd       xmm1,   [GLOBAL(four8s)]        ; +8 for rounding before the shift
    616 
    617             psrad       xmm1,   4
    618             packssdw    xmm1,   xmm0
    619 
    620             packuswb    xmm1,   xmm0
    621             pand        xmm1,   xmm5                    ; filtered value where the mask is set
    622 
    623             pandn       xmm5,   xmm2                    ; original value elsewhere
    624             por         xmm5,   xmm1
    625 
    626             movd        [rsi+rcx-8],  mm0               ; store the result computed two iterations ago (columns c-8..c-5)
    627             movq        mm0,    mm1
    628 
    629             movdq2q     mm1,    xmm5
    630             psrldq      xmm7,   12                      ; carry lane 3 (last column) into lane 0 for the next group
    631 
    632             psrldq      xmm6,   12
    633             add         rcx,    4
    634 
    635             cmp         rcx,    rdx
    636             jl          .nextcol4
    637 
    638         ;s+=pitch;
    639         movsxd rax, dword arg(1)
    640         add    arg(0), rax
    641 
    642         sub dword arg(2), 1 ;rows-=1
    643         cmp dword arg(2), 0
    644         jg .ip_row_loop
    645 
    646     add         rsp, 16
    647     pop         rsp
    648 
    649     ; begin epilog (NOTE(review): MMX registers were used and no emms is issued here; presumably the caller clears that state - confirm)
    650     pop rdi
    651     pop rsi
    652     RESTORE_GOT
    653     RESTORE_XMM
    654     UNSHADOW_ARGS
    655     pop         rbp
    656     ret
    657 %undef flimit4
    658 
    659 
    660 SECTION_RODATA
    661 align 16
    662 four8s:                     ; four dwords of 8: added before the >>4 in vpx_mbpost_proc_across_ip_sse2
    663     times 4 dd 8
    664