Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 %define VP8_FILTER_WEIGHT 128
     15 %define VP8_FILTER_SHIFT  7
     16 
     17 ;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
     18 ;                             int pitch, int rows, int cols,int flimit)
        ;
        ; Filters the image downwards, one 4-pixel-wide column strip at a time.
        ; For every pixel a 15-row vertical window centred on it is tracked:
        ;   mm5       = per-column sum of the window            (4 x 16 bit)
        ;   mm6 / mm7 = per-column sum of squares of the window (2 x 32 bit each)
        ; Where 15*sumsq - sum*sum < flimit the pixel is replaced by
        ;   (pixel + sum + word at vp8_rv + 2*((row & 127) + column)) >> 4
        ; otherwise the pixel is left untouched.
        ; Filtered rows are parked in a 16-entry ring buffer on the stack and are
        ; written back to the image 8 rows later.
        ;
        ; Side effects visible in the code:
        ;   * the first and the last image row are replicated (8 bytes wide) into
        ;     the 8 rows above dst and the 8 rows below the last row, so that
        ;     memory has to be writable;
        ;   * the arg(0), arg(2) and arg(3) slots are modified in place.
     19 extern sym(vp8_rv)
     20 global sym(vp8_mbpost_proc_down_mmx) PRIVATE
     21 sym(vp8_mbpost_proc_down_mmx):
     22     push        rbp
     23     mov         rbp, rsp
     24     SHADOW_ARGS_TO_STACK 5
     25     GET_GOT     rbx
     26     push        rsi
     27     push        rdi
     28     ; end prolog
     29 
     30     ALIGN_STACK 16, rax
     31     sub         rsp, 136
     32 
     33     ; ring buffer d[16] of 4-byte entries at [rsp] (indexed as [rsp+i*4])
     34     ; create flimit2 at [rsp+128]: flimit duplicated into both dwords
     35     mov         eax, dword ptr arg(4) ;flimit
     36     mov         [rsp+128], eax
     37     mov         [rsp+128+4], eax
     38 %define flimit2 [rsp+128]
     39 
     40 %if ABI_IS_32BIT=0
     41     lea         r8,       [GLOBAL(sym(vp8_rv))]   ; r8 stays live for the whole function
     42 %endif
     43 
     44     ;rows +=8;  (the row loop runs 8 extra iterations to flush the delayed write-back)
     45     add         dword ptr arg(2), 8
     46 
     47     ;for(c=0; c<cols; c+=4)
     48 .loop_col:
     49             mov         rsi,        arg(0)  ;s
     50             pxor        mm0,        mm0     ; mm0 = 0, used for unpacking throughout
     51 
     52             movsxd      rax,        dword ptr arg(1) ;pitch       ;
     53 
     54             ; this copies the last row down into the border 8 rows
     55             mov         rdi,        rsi
     56             movsxd      rdx,        dword ptr arg(2)            ; rows+8 is an int: use only the low dword
     57             sub         rdx,        9                           ; (rows+8) - 9 = index of the last row
     58             imul        rdx,        rax
     59             lea         rdi,        [rdi+rdx]
     60             movq        mm1,        QWORD ptr[rdi]              ; last row
     61             mov         rcx,        8
     62 .init_borderd:                                                   ; initialize borders
     63             lea         rdi,        [rdi + rax]
     64             movq        [rdi],      mm1
     65 
     66             dec         rcx
     67             jne         .init_borderd
     68 
     69             neg         rax                                     ; rax = -pitch
     70 
     71             ; this copies the first row up into the border 8 rows
     72             mov         rdi,        rsi
     73             movq        mm1,        QWORD ptr[rdi]              ; first row
     74             mov         rcx,        8
     75 .init_border:                                                   ; initialize borders
     76             lea         rdi,        [rdi + rax]
     77             movq        [rdi],      mm1
     78 
     79             dec         rcx
     80             jne         .init_border
     81 
     82 
     83             lea         rsi,        [rsi + rax*8];              ; rsi = s - pitch*8 (rax = -pitch here)
     84             neg         rax                                     ; rax = +pitch again
     85 
     86 
     87             pxor        mm5,        mm5     ; running sum
     88             pxor        mm6,        mm6     ; running sum of squares, columns 0-1
     89 
     90             pxor        mm7,        mm7     ; running sum of squares, columns 2-3
     91             mov         rdi,        rsi
     92 
     93             mov         rcx,        15          ; prime the window with 15 rows starting at s - pitch*8
     94 
     95 .loop_initvar:
     96             movd        mm1,        DWORD PTR [rdi];
     97             punpcklbw   mm1,        mm0     ; 4 bytes -> 4 words
     98 
     99             paddw       mm5,        mm1     ;
    100             pmullw      mm1,        mm1     ;
    101 
    102             movq        mm2,        mm1     ;
    103             punpcklwd   mm1,        mm0     ;
    104 
    105             punpckhwd   mm2,        mm0     ;
    106             paddd       mm6,        mm1     ;
    107 
    108             paddd       mm7,        mm2     ;
    109             lea         rdi,        [rdi+rax]   ;
    110 
    111             dec         rcx
    112             jne         .loop_initvar
    113             ;save the var and sum
    114             xor         rdx,        rdx     ; rdx = row counter
    115 .loop_row:
                    ; slide the window one row down: add the row entering at rdi,
                    ; remove the row leaving at rsi
    116             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
    117             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
    118 
    119             punpcklbw   mm1,        mm0
    120             punpcklbw   mm2,        mm0
    121 
    122             paddw       mm5,        mm2
    123             psubw       mm5,        mm1
    124 
    125             pmullw      mm2,        mm2
    126             movq        mm4,        mm2
    127 
    128             punpcklwd   mm2,        mm0
    129             punpckhwd   mm4,        mm0
    130 
    131             paddd       mm6,        mm2
    132             paddd       mm7,        mm4
    133 
    134             pmullw      mm1,        mm1
    135             movq        mm2,        mm1
    136 
    137             punpcklwd   mm1,        mm0
    138             psubd       mm6,        mm1
    139 
    140             punpckhwd   mm2,        mm0
    141             psubd       mm7,        mm2
    142 
    143 
                    ; mm3/mm4 = sumsq*16 - sumsq - sum*sum - flimit, per column
    144             movq        mm3,        mm6
    145             pslld       mm3,        4
    146 
    147             psubd       mm3,        mm6
    148             movq        mm1,        mm5
    149 
    150             movq        mm4,        mm5
    151             pmullw      mm1,        mm1     ; low  16 bits of sum*sum
    152 
    153             pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
    154             movq        mm2,        mm1
    155 
    156             punpcklwd   mm1,        mm4
    157             punpckhwd   mm2,        mm4
    158 
    159             movq        mm4,        mm7
    160             pslld       mm4,        4
    161 
    162             psubd       mm4,        mm7
    163 
    164             psubd       mm3,        mm1
    165             psubd       mm4,        mm2
    166 
    167             psubd       mm3,        flimit2
    168             psubd       mm4,        flimit2
    169 
    170             psrad       mm3,        31      ; sign bit -> all-ones mask where the result is negative
    171             psrad       mm4,        31
    172 
    173             packssdw    mm3,        mm4
    174             packsswb    mm3,        mm0     ; low dword of mm3 = one mask byte per column
    175 
    176             movd        mm1,        DWORD PTR [rsi+rax*8]   ; the 4 pixels of the current row
    177 
    178             movq        mm2,        mm1     ; keep the unfiltered pixels
    179             punpcklbw   mm1,        mm0
    180 
    181             paddw       mm1,        mm5
    182             mov         rcx,        rdx
    183 
    184             and         rcx,        127
    185 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
    186             push        rax
    187             lea         rax,        [GLOBAL(sym(vp8_rv))]
    188             movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
    189             pop         rax
    190 %elif ABI_IS_32BIT=0
    191             movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
    192 %else
    193             movq        mm4,        [sym(vp8_rv) + rcx*2]
    194 %endif
    195             paddw       mm1,        mm4
    196             psraw       mm1,        4
    197 
    198             packuswb    mm1,        mm0
    199             pand        mm1,        mm3     ; filtered value where the mask is set ...
    200 
    201             pandn       mm3,        mm2     ; ... original pixel elsewhere
    202             por         mm1,        mm3
    203 
    204             and         rcx,        15
    205             movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]
    206 
    207             cmp         edx,        8
    208             jl          .skip_assignment
    209 
                    ; write back the result computed 8 iterations ago; rsi points at that row
    210             mov         rcx,        rdx
    211             sub         rcx,        8
    212             and         rcx,        15
    213             movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
    214             movd        [rsi],      mm1
    215 
    216 .skip_assignment:
    217             lea         rsi,        [rsi+rax]
    218 
    219             lea         rdi,        [rdi+rax]
    220             add         rdx,        1
    221 
    222             cmp         edx,        dword arg(2) ;rows
    223             jl          .loop_row
    224 
    225 
    226         mov         rcx, arg(0)     ; s += 4 at full pointer width: a dword add
                add         rcx, 4          ; on the slot would lose the carry into the
                mov         arg(0), rcx     ; upper half of a 64-bit pointer
    227         sub         dword arg(3), 4 ; cols -= 4
    228         cmp         dword arg(3), 0
    229         jg          .loop_col
    230 
    231     add         rsp, 136
    232     pop         rsp                 ; undo ALIGN_STACK
    233 
    234     ; begin epilog
    235     pop rdi
    236     pop rsi
    237     RESTORE_GOT
    238     UNSHADOW_ARGS
    239     pop         rbp
    240     ret
    241 %undef flimit2
    242 
    243 
    244 ;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
    245 ;                            unsigned char blackclamp[16],
    246 ;                            unsigned char whiteclamp[16],
    247 ;                            unsigned char bothclamp[16],
    248 ;                            unsigned int Width, unsigned int Height, int Pitch)
        ;
        ; Adds noise to a plane, row by row. For each row:
        ;   * LIBVPX_RAND is called and the low 8 bits of its result select the
        ;     starting offset inside the noise buffer;
        ;   * every 8-byte chunk of the row is clamped with saturating byte
        ;     arithmetic (- blackclamp, + bothclamp, - whiteclamp) and then the
        ;     noise bytes are added with wrapping byte addition.
        ; The row is processed in steps of 8 bytes while the byte offset is below
        ; Width, so a Width that is not a multiple of 8 is rounded up.
        ; Only blackclamp is loaded; whiteclamp and bothclamp are addressed as
        ; blackclamp+16 and blackclamp+32, and 8 bytes of each vector are read.
        ; Nothing is done when Height or Width is 0.
    249 global sym(vp8_plane_add_noise_mmx) PRIVATE
    250 sym(vp8_plane_add_noise_mmx):
    251     push        rbp
    252     mov         rbp, rsp
    253     SHADOW_ARGS_TO_STACK 8
    254     GET_GOT     rbx
    255     push        rsi
    256     push        rdi
    257     ; end prolog
    258 
            ; both loops below test their condition at the bottom, so an empty
            ; plane has to be rejected up front
            cmp     dword arg(6), 0 ;Height
            je      .addnoise_done
            cmp     dword arg(5), 0 ;Width
            je      .addnoise_done

    259 .addnoise_loop:
    260     call sym(LIBVPX_RAND) WRT_PLT
    261     mov     rcx, arg(1) ;noise
    262     and     rax, 0xff
    263     add     rcx, rax
    264 
    265     ; we rely on the fact that the clamping vectors are stored contiguously
    266     ; in black/white/both order. Note that we have to reload this here because
    267     ; rdx could be trashed by rand()
    268     mov     rdx, arg(2) ; blackclamp
    269 
    270 
    271             mov     rdi, rcx              ; rdi = noise + random offset
    272             mov     ecx, dword arg(5) ;[Width] is unsigned: load without sign extension
    273             mov     rsi, arg(0) ;Pos
    274             xor         rax,rax           ; rax = byte offset within the row
    275 
    276 .addnoise_nextset:
    277             movq        mm1,[rsi+rax]         ; get the source
    278 
    279             psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
    280             paddusb     mm1, [rdx+32] ;bothclamp
    281             psubusb     mm1, [rdx+16] ;whiteclamp
    282 
    283             movq        mm2,[rdi+rax]         ; get the noise for these 8 pixels
    284             paddb       mm1,mm2              ; add it in
    285             movq        [rsi+rax],mm1         ; store the result
    286 
    287             add         rax,8                 ; advance to the next 8 bytes of the row
    288 
    289             cmp         rax, rcx
    290             jb          .addnoise_nextset     ; unsigned compare, Width is unsigned
    291 
    292     movsxd  rax, dword arg(7) ; Pitch
    293     add     arg(0), rax ; Start += Pitch
    294     sub     dword arg(6), 1   ; Height -= 1
    295     jg      .addnoise_loop
    296 
        .addnoise_done:
    297     ; begin epilog
    298     pop rdi
    299     pop rsi
    300     RESTORE_GOT
    301     UNSHADOW_ARGS
    302     pop         rbp
    303     ret
    304 
    305 
    306 SECTION_RODATA
        ; Read-only tables of 16-bit words. Neither label is referenced by the
        ; code visible in this listing - NOTE(review): confirm they are still
        ; needed before removing them.
    307 align 16
        ; Blur: 48 words laid out as 16 x 16, 8 x 64, 16 x 16, 8 x 0.
    308 Blur:
    309     times 16 dw 16
    310     times  8 dw 64
    311     times 16 dw 16
    312     times  8 dw  0
    313 
        ; rd: four words of 0x40 (64) - presumably a rounding term, half of
        ; VP8_FILTER_WEIGHT (128); TODO confirm against the code that used it.
    314 rd:
    315     times 4 dw 0x40
    316