Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %define VP8_FILTER_WEIGHT 128
     15 %define VP8_FILTER_SHIFT  7
     16 
     17 ;void vp8_post_proc_down_and_across_mmx
     18 ;(
     19 ;    unsigned char *src_ptr,
     20 ;    unsigned char *dst_ptr,
     21 ;    int src_pixels_per_line,
     22 ;    int dst_pixels_per_line,
     23 ;    int rows,
     24 ;    int cols,
     25 ;    int flimit
     26 ;)
        ;
        ; Two-pass 5-tap blur, four pixels per step, one row at a time:
        ;   down   - filters src rows -2..+2 vertically and writes the row to dst
        ;   across - filters the freshly written dst row horizontally, in place
        ; Taps come from the Blur table (16, 16, 64, 16, 16); the sum is rounded
        ; with rd and shifted right by VP8_FILTER_SHIFT.  A pixel keeps its
        ; unfiltered value when its absolute difference to any of the four
        ; neighbouring taps is greater than flimit.
        ;
        ; Memory touched outside the rows x cols rectangle (visible below):
        ;   - src is read two rows above and two rows below every row
        ;   - src and dst are read 8 bytes at a time, i.e. past column `cols`
        ;   - dst is read starting 4 bytes left of each row, and those 4 bytes
        ;     are stored back unchanged
        ; cols is consumed in steps of 4 and every loop body runs at least once.
        ;
        ; Register roles inside the loops:
        ;   rsi = src row          rdi = dst row        rbx = Blur table
        ;   rax = src pitch (negated temporarily inside the down pass)
        ;   rcx = rows remaining   rdx = column offset
        ;   mm0 = 0                mm2 = flimit in each of the four words
     27 global sym(vp8_post_proc_down_and_across_mmx)
     28 sym(vp8_post_proc_down_and_across_mmx):
     29     push        rbp
     30     mov         rbp, rsp
     31     SHADOW_ARGS_TO_STACK 7
     32     GET_GOT     rbx
     33     push        rsi
     34     push        rdi
     35     ; end prolog
     36 
     37 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
     38     ; move the global rd onto the stack, since we don't have enough registers
     39     ; to do PIC addressing
     40     movq        mm0, [GLOBAL(rd)]
     41     sub         rsp, 8
     42     movq        [rsp], mm0
        ; The copy is addressed relative to rsp, so each use must skip the 4-byte
        ; registers pushed after it: rbx during the down pass, rbx and rax during
        ; the across pass.
     43 %define RD_DOWN   [rsp + 4]
        %define RD_ACROSS [rsp + 8]
     44 %else
     45 %define RD_DOWN   [GLOBAL(rd)]
        %define RD_ACROSS [GLOBAL(rd)]
     46 %endif
     47 
     48         push        rbx
     49         lea         rbx, [GLOBAL(Blur)]   ; rbx = tap table (replaces the GOT pointer)
     50         movd        mm2, dword ptr arg(6) ;flimit
     51         punpcklwd   mm2, mm2
     52         punpckldq   mm2, mm2              ; mm2 = low word of flimit in all 4 words
     53 
     54         mov         rsi,        arg(0) ;src_ptr
     55         mov         rdi,        arg(1) ;dst_ptr
     56 
     57         movsxd      rcx, DWORD PTR arg(4) ;rows
     58         movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
     59         pxor        mm0, mm0              ; mm0 = 00000000
     60 
     61 nextrow:
     62 
     63         xor         rdx,        rdx       ; clear out rdx for use as loop counter
     64 nextcol:
     65 
     66         pxor        mm7, mm7              ; mm7 = 00000000
     67         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
     68         movq        mm3, [rsi]            ; mm3 = r0 p0..p7
     69         punpcklbw   mm3, mm0              ; mm3 = r0 p0..p3 as words
     70         movq        mm1, mm3              ; mm1 = r0 p0..p3 (unfiltered copy)
     71         pmullw      mm3, mm6              ; mm3 = r0 * kernel 2 taps
     72 
     73         movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
     74         movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
     75         punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
     76         pmullw      mm6, mm5              ; mm6 = r1 p0..p3 * kernel 3 taps
     77         paddusw     mm3, mm6              ; mm3 += mm6
     78 
     79         ; thresholding
     80         movq        mm7, mm1              ; mm7 = r0 p0..p3
     81         psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
     82         psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
     83         paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
     84         pcmpgtw     mm7, mm2              ; mm7 = mask of words with abs diff > flimit
     85 
     86         movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 taps
     87         movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
     88         punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
     89         pmullw      mm6, mm5              ; mm6 = r2 p0..p3 * kernel 4 taps
     90         paddusw     mm3, mm6              ; mm3 += mm6
     91 
     92         ; thresholding
     93         movq        mm6, mm1              ; mm6 = r0 p0..p3
     94         psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
     95         psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
     96         paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
     97         pcmpgtw     mm6, mm2
     98         por         mm7, mm6              ; accumulate thresholds
     99 
    100 
    101         neg         rax                   ; rax = -pitch, to reach the rows above
    102         movq        mm6, [rbx ]           ; kernel 0 taps
    103         movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
    104         punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
    105         pmullw      mm6, mm5              ; mm6 = r-2 p0..p3 * kernel 0 taps
    106         paddusw     mm3, mm6              ; mm3 += mm6
    107 
    108         ; thresholding
    109         movq        mm6, mm1              ; mm6 = r0 p0..p3
    110         psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r-2 p0..p3
    111         psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - r0 p0..p3
    112         paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
    113         pcmpgtw     mm6, mm2
    114         por         mm7, mm6              ; accumulate thresholds
    115 
    116         movq        mm6, [rbx + 16]       ; kernel 1 taps
    117         movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
    118         punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
    119         pmullw      mm6, mm4              ; mm6 = r-1 p0..p3 * kernel 1 taps
    120         paddusw     mm3, mm6              ; mm3 += mm6
    121 
    122         ; thresholding
    123         movq        mm6, mm1              ; mm6 = r0 p0..p3
    124         psubusw     mm6, mm4              ; mm6 = r0 p0..p3 - r-1 p0..p3
    125         psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - r0 p0..p3
    126         paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
    127         pcmpgtw     mm6, mm2
    128         por         mm7, mm6              ; accumulate thresholds
    129 
    130 
    131         paddusw     mm3, RD_DOWN          ; mm3 += round value
    132         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
    133 
    134         pand        mm1, mm7              ; mm1 = source where a diff exceeded flimit
    135         pandn       mm7, mm3              ; mm7 = blurred result everywhere else
    136         paddusw     mm1, mm7              ; combination
    137 
    138         packuswb    mm1, mm0              ; pack to bytes
    139 
    140         movd        [rdi], mm1            ; write 4 pixels to dst
    141         neg         rax                   ; pitch is positive
    142 
    143 
    144         add         rsi, 4
    145         add         rdi, 4
    146         add         rdx, 4
    147 
    148         cmp         edx, dword ptr arg(5) ;cols
    149         jl          nextcol
    150         ; done with the all cols, start the across filtering in place
    151         sub         rsi, rdx              ; rewind both pointers to the row start
    152         sub         rdi, rdx
    153 
    154 
    155         push        rax                   ; save pitch; eax carries delayed output below
    156         xor         rdx,    rdx
        ; Output is stored one group late so the next group still sees unfiltered
        ; left neighbours.  Prime eax with the bytes left of the row so the first
        ; delayed store rewrites them unchanged.
    157         mov         rax,    [rdi-4];
    158 
    159 acrossnextcol:
    160         pxor        mm7, mm7              ; mm7 = 00000000
    161         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
    162         movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
    163         movq        mm3, mm4              ; mm3 = p0..p7
    164         punpcklbw   mm3, mm0              ; mm3 = p0..p3
    165         movq        mm1, mm3              ; mm1 = p0..p3 (unfiltered copy)
    166         pmullw      mm3, mm6              ; mm3 = p0..p3 * kernel 2 taps
    167 
    168         movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
    169         psrlq       mm4, 8                ; mm4 = p1..p7
    170         movq        mm5, mm4              ; mm5 = p1..p7
    171         punpcklbw   mm5, mm0              ; mm5 = p1..p4
    172         pmullw      mm6, mm5              ; mm6 = p1..p4 * kernel 3 taps
    173         paddusw     mm3, mm6              ; mm3 += mm6
    174 
    175         ; thresholding
    176         movq        mm7, mm1              ; mm7 = p0..p3
    177         psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
    178         psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
    179         paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
    180         pcmpgtw     mm7, mm2              ; mm7 = mask of words with abs diff > flimit
    181 
    182         movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 taps
    183         psrlq       mm4, 8                ; mm4 = p2..p7
    184         movq        mm5, mm4              ; mm5 = p2..p7
    185         punpcklbw   mm5, mm0              ; mm5 = p2..p5
    186         pmullw      mm6, mm5              ; mm6 = p2..p5 * kernel 4 taps
    187         paddusw     mm3, mm6              ; mm3 += mm6
    188 
    189         ; thresholding
    190         movq        mm6, mm1              ; mm6 = p0..p3
    191         psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
    192         psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
    193         paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
    194         pcmpgtw     mm6, mm2
    195         por         mm7, mm6              ; accumulate thresholds
    196 
    197 
    198         movq        mm6, [rbx ]           ; mm6 = kernel 0 taps
    199         movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
    200         movq        mm5, mm4              ; mm5 = p-2..p5
    201         punpcklbw   mm5, mm0              ; mm5 = p-2..p1
    202         pmullw      mm6, mm5              ; mm6 = p-2..p1 * kernel 0 taps
    203         paddusw     mm3, mm6              ; mm3 += mm6
    204 
    205         ; thresholding
    206         movq        mm6, mm1              ; mm6 = p0..p3
    207         psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
    208         psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
    209         paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
    210         pcmpgtw     mm6, mm2
    211         por         mm7, mm6              ; accumulate thresholds
    212 
    213         movq        mm6, [rbx + 16]       ; mm6 = kernel 1 taps
    214         psrlq       mm4, 8                ; mm4 = p-1..p5
    215         punpcklbw   mm4, mm0              ; mm4 = p-1..p2
    216         pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 taps
    217         paddusw     mm3, mm6              ; mm3 += mm6
    218 
    219         ; thresholding
    220         movq        mm6, mm1              ; mm6 = p0..p3
    221         psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
    222         psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
    223         paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
    224         pcmpgtw     mm6, mm2
    225         por         mm7, mm6              ; accumulate thresholds
    226 
    227         paddusw     mm3, RD_ACROSS        ; mm3 += round value
    228         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 /= 128
    229 
    230         pand        mm1, mm7              ; mm1 = source where a diff exceeded flimit
    231         pandn       mm7, mm3              ; mm7 = blurred result everywhere else
    232         paddusw     mm1, mm7              ; combination
    233 
    234         packuswb    mm1, mm0              ; pack to bytes
    235         mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
    236         movd        eax,    mm1           ; hold this group until the next iteration
    237 
    238         add         rdx, 4
    239         cmp         edx, dword ptr arg(5) ;cols
    240         jl          acrossnextcol;
    241 
    242         mov         DWORD PTR [rdi+rdx-4],  eax   ; flush the last group
    243         pop         rax                   ; rax = source pitch again
    244 
    245         ; done with this row
    246         add         rsi,rax               ; next line
    247         movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line (destination pitch)
    248         add         rdi,rax               ; next destination
    249         movsxd      rax, dword ptr arg(2) ;src_pixels_per_line (source pitch)
    250 
    251         dec         rcx                   ; decrement count
    252         jnz         nextrow               ; next row
    253         pop         rbx
        %if ABI_IS_32BIT=1 && CONFIG_PIC=1
            add         rsp, 8                ; release the stack copy of rd
        %endif
    254 
    255     ; begin epilog
    256     pop rdi
    257     pop rsi
    258     RESTORE_GOT
    259     UNSHADOW_ARGS
    260     pop         rbp
    261     ret
    262 %undef RD_DOWN
        %undef RD_ACROSS
    263 
    264 
    265 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
    266 ;                             int pitch, int rows, int cols,int flimit)
        ;
        ; Vertical smoothing, four columns at a time.  For each column group a
        ; running sum (mm5) and running sum of squares (mm6/mm7) are kept over a
        ; 15-row window, s[-7*pitch] .. s[+7*pitch] around the current row.  A
        ; pixel is replaced by (pixel + sum + vp8_rv[...]) >> 4 when
        ; 15*sumsq - sum*sum < flimit, and is left unchanged otherwise.
        ;
        ; Results pass through a 16-entry ring buffer on the stack and reach
        ; memory 8 rows late, so the window always reads unfiltered pixels.
        ; This is why the row count is bumped by 8.
        ;
        ; NOTE(review): during the first 8 iterations the ring-buffer slots that
        ; are read back have not been written yet, so the 8 rows above dst
        ; receive whatever was on the stack.  The window also reads 8 rows above
        ; the first row and rows below the last one.  Presumably the caller
        ; provides a border for this - confirm.
        ;
        ; arg(0), arg(2) and arg(3) are modified in place.
    267 extern sym(vp8_rv)
    268 global sym(vp8_mbpost_proc_down_mmx)
    269 sym(vp8_mbpost_proc_down_mmx):
    270     push        rbp
    271     mov         rbp, rsp
    272     SHADOW_ARGS_TO_STACK 5
    273     GET_GOT     rbx
    274     push        rsi
    275     push        rdi
    276     ; end prolog
    277 
        ; ALIGN_STACK comes from x86_abi_support.asm; presumably it saves the old
        ; rsp on the stack, which the `pop rsp` before the epilog restores - confirm.
    278     ALIGN_STACK 16, rax
    279     sub         rsp, 136
    280 
    281     ; unsigned char d[16][4] at [rsp] (ring buffer, one 4-pixel group per row)
    282     ; create flimit2 at [rsp+128]
    283     mov         eax, dword ptr arg(4) ;flimit
    284     mov         [rsp+128], eax
    285     mov         [rsp+128+4], eax      ; flimit in both dwords for psubd
    286 %define flimit2 [rsp+128]
    287 
    288 %if ABI_IS_32BIT=0
    289     lea         r8,       [GLOBAL(sym(vp8_rv))]
    290 %endif
    291 
    292     ;rows +=8;
    293     add         dword ptr arg(2), 8
    294 
    295     ;for(c=0; c<cols; c+=4)
    296 loop_col:
    297             mov         rsi,        arg(0)  ;s
    298             pxor        mm0,        mm0     ;
    299 
    300             movsxd      rax,        dword ptr arg(1) ;pitch       ;
    301             neg         rax                                     ; rax = -pitch
    302 
    303             lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
    304             neg         rax                                     ; rax = +pitch
    305 
    306 
    307             pxor        mm5,        mm5     ; mm5 = running sum, 4 words
    308             pxor        mm6,        mm6     ; mm6 = running sum of squares, pixels 0-1
    309 
    310             pxor        mm7,        mm7     ; mm7 = running sum of squares, pixels 2-3
    311             mov         rdi,        rsi
    312 
    313             mov         rcx,        15          ; prime the window with 15 rows
    314 
    315 loop_initvar:
    316             movd        mm1,        DWORD PTR [rdi];
    317             punpcklbw   mm1,        mm0     ; 4 pixels as words
    318 
    319             paddw       mm5,        mm1     ; sum += pixel
    320             pmullw      mm1,        mm1     ; pixel^2 (fits in 16 bits)
    321 
    322             movq        mm2,        mm1     ;
    323             punpcklwd   mm1,        mm0     ; squares of pixels 0-1 as dwords
    324 
    325             punpckhwd   mm2,        mm0     ; squares of pixels 2-3 as dwords
    326             paddd       mm6,        mm1     ;
    327 
    328             paddd       mm7,        mm2     ;
    329             lea         rdi,        [rdi+rax]   ; next row; ends at s[+pitch*7]
    330 
    331             dec         rcx
    332             jne         loop_initvar
    333             ;save the var and sum
    334             xor         rdx,        rdx     ; rdx = row counter
    335 loop_row:
    336             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
    337             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
    338 
    339             punpcklbw   mm1,        mm0
    340             punpcklbw   mm2,        mm0
    341 
    342             paddw       mm5,        mm2     ; slide the window: add the new bottom row
    343             psubw       mm5,        mm1     ; and drop the old top row
    344 
    345             pmullw      mm2,        mm2
    346             movq        mm4,        mm2
    347 
    348             punpcklwd   mm2,        mm0
    349             punpckhwd   mm4,        mm0
    350 
    351             paddd       mm6,        mm2     ; sumsq += bottom^2
    352             paddd       mm7,        mm4
    353 
    354             pmullw      mm1,        mm1
    355             movq        mm2,        mm1
    356 
    357             punpcklwd   mm1,        mm0
    358             psubd       mm6,        mm1     ; sumsq -= top^2
    359 
    360             punpckhwd   mm2,        mm0
    361             psubd       mm7,        mm2
    362 
    363 
    364             movq        mm3,        mm6
    365             pslld       mm3,        4
    366 
    367             psubd       mm3,        mm6     ; mm3 = 15 * sumsq (pixels 0-1)
    368             movq        mm1,        mm5
    369 
    370             movq        mm4,        mm5
    371             pmullw      mm1,        mm1     ; low 16 bits of sum*sum
    372 
    373             pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
    374             movq        mm2,        mm1
    375 
    376             punpcklwd   mm1,        mm4     ; sum*sum as dwords, pixels 0-1
    377             punpckhwd   mm2,        mm4     ; sum*sum as dwords, pixels 2-3
    378 
    379             movq        mm4,        mm7
    380             pslld       mm4,        4
    381 
    382             psubd       mm4,        mm7     ; mm4 = 15 * sumsq (pixels 2-3)
    383 
    384             psubd       mm3,        mm1     ; 15*sumsq - sum*sum
    385             psubd       mm4,        mm2
    386 
    387             psubd       mm3,        flimit2
    388             psubd       mm4,        flimit2
    389 
    390             psrad       mm3,        31      ; all ones where the value is below flimit
    391             psrad       mm4,        31
    392 
    393             packssdw    mm3,        mm4
    394             packsswb    mm3,        mm0     ; mm3 = per-byte select mask, 4 pixels
    395 
    396             movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row s[0]
    397 
    398             movq        mm2,        mm1     ; mm2 = unfiltered pixels
    399             punpcklbw   mm1,        mm0
    400 
    401             paddw       mm1,        mm5     ; pixel + sum
    402             mov         rcx,        rdx
    403 
    404             and         rcx,        127     ; index into vp8_rv wraps at 128
    405 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    406             push        rax
    407             lea         rax,        [GLOBAL(sym(vp8_rv))]
    408             movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
    409             pop         rax
    410 %elif ABI_IS_32BIT=0
    411             movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
    412 %else
    413             movq        mm4,        [sym(vp8_rv) + rcx*2]
    414 %endif
    415             paddw       mm1,        mm4     ; + rounding values from vp8_rv
    416             ;paddw     xmm1,       eight8s
    417             psraw       mm1,        4
    418 
    419             packuswb    mm1,        mm0
    420             pand        mm1,        mm3     ; filtered value where the mask is set
    421 
    422             pandn       mm3,        mm2     ; original value elsewhere
    423             por         mm1,        mm3
    424 
    425             and         rcx,        15
    426             movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
    427 
    428             mov         rcx,        rdx
    429             sub         rcx,        8
    430 
    431             and         rcx,        15
    432             movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4], result from 8 rows ago
    433 
    434             movd        [rsi],      mm1     ; rsi = 8 rows above the current row
    435             lea         rsi,        [rsi+rax]
    436 
    437             lea         rdi,        [rdi+rax]
    438             add         rdx,        1
    439 
    440             cmp         edx,        dword arg(2) ;rows
    441             jl          loop_row
    442 
    443 
        ; Advance the stored pointer at full register width.  A dword add would
        ; drop the carry into the upper half of a 64-bit pointer.  rsi is dead
        ; here; loop_col reloads it.
    444         mov         rsi,        arg(0)
                add         rsi,        4   ; s += 4
                mov         arg(0),     rsi
    445         sub         dword arg(3), 4 ; cols -= 4
    446         cmp         dword arg(3), 0
    447         jg          loop_col
    448 
    449     add         rsp, 136
    450     pop         rsp
    451 
    452     ; begin epilog
    453     pop rdi
    454     pop rsi
    455     RESTORE_GOT
    456     UNSHADOW_ARGS
    457     pop         rbp
    458     ret
    459 %undef flimit2
    460 
    461 
    462 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
    463 ;                            unsigned char blackclamp[16],
    464 ;                            unsigned char whiteclamp[16],
    465 ;                            unsigned char bothclamp[16],
    466 ;                            unsigned int Width, unsigned int Height, int Pitch)
        ;
        ; Row by row: pick an offset (rand() & 0xff) into the noise buffer, clamp
        ; each group of 8 source bytes with the three clamp vectors and add the
        ; noise bytes with wrap-around (paddb).  The image is modified in place.
        ; Only the blackclamp pointer is used; whiteclamp and bothclamp are read
        ; at +16 and +32 from it.  Width is covered in steps of 8 bytes, and both
        ; loops run at least once.  arg(0) and arg(6) are modified in place.
    467 extern sym(rand)
    468 global sym(vp8_plane_add_noise_mmx)
    469 sym(vp8_plane_add_noise_mmx):
    470     push        rbp
    471     mov         rbp, rsp
    472     SHADOW_ARGS_TO_STACK 8
    473     GET_GOT     rbx
    474     push        rsi
    475     push        rdi
    476     ; prolog done
    477 
    478 addnoise_loop:
    479     call        sym(rand) WRT_PLT
    480     and         rax, 0xff             ; offset into noise for this row: 0..255
    481     mov         rdi, arg(1)           ; noise
    482     add         rdi, rax              ; rdi = noise + offset
    483 
    484     ; The clamp vectors are expected back to back in memory, ordered
    485     ; black / white / both.  The base pointer is fetched on every row
    486     ; because rand() is free to overwrite rdx.
    487     mov         rdx, arg(2)           ; blackclamp
    488 
    489     mov         rsi, arg(0)           ; rsi = start of the current row
    490     movsxd      rcx, dword arg(5)     ; rcx = Width
    491     xor         rax, rax              ; rax = byte offset within the row
    492 
    493 
    494 
    495 addnoise_nextset:
    496     movq        mm2, [rdi+rax]        ; 8 noise bytes
    497     movq        mm1, [rsi+rax]        ; 8 source bytes
    498 
    499     ; squeeze the source range so adding noise cannot go out of range
    500     psubusb     mm1, [rdx]            ; - blackclamp (saturating)
    501     paddusb     mm1, [rdx+32]         ; + bothclamp  (saturating)
    502     psubusb     mm1, [rdx+16]         ; - whiteclamp (saturating)
    503 
    504     paddb       mm1, mm2              ; apply the noise
    505     movq        [rsi+rax], mm1        ; write back in place
    506 
    507     add         rax, 8                ; next 8 bytes of this row
    508     cmp         rax, rcx
    509     jl          addnoise_nextset
    510 
    511     movsxd      rax, dword arg(7)     ; Pitch
    512     add         arg(0), rax           ; Start += Pitch
    513     sub         dword arg(6), 1       ; Height -= 1
    514     jg          addnoise_loop         ; loop while rows remain
    515 
    516     ; epilog
    517     pop         rdi
    518     pop         rsi
    519     RESTORE_GOT
    520     UNSHADOW_ARGS
    521     pop         rbp
    522     ret
    523 
    524 
    525 SECTION_RODATA
    526 align 16
        ; 5-tap blur weights, one 16-byte row per tap (the filter loads the first
        ; 8 bytes of a row with movq).  The taps sum to VP8_FILTER_WEIGHT (128).
    527 Blur:
    528     times  8 dw 16      ; +0   kernel 0 (two rows/pixels before)
            times  8 dw 16      ; +16  kernel 1 (one before)
    529     times  8 dw 64      ; +32  kernel 2 (centre)
    530     times  8 dw 16      ; +48  kernel 3 (one after)
            times  8 dw 16      ; +64  kernel 4 (two after)
    531     times  8 dw  0      ; +80  not referenced by the filter code
    532 
        ; rounding term, added before the shift by VP8_FILTER_SHIFT
    533 rd:
    534     times 4 dw 0x40
    535