Home | History | Annotate | Download | only in x86
      1 ;
      2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 
     12 %include "vpx_ports/x86_abi_support.asm"
     13 
     14 ; void vp8_temporal_filter_apply_sse2 | arg
     15 ;  (unsigned char  *frame1,           |  0
     16 ;   unsigned int    stride,           |  1
     17 ;   unsigned char  *frame2,           |  2
     18 ;   unsigned int    block_size,       |  3
     19 ;   int             strength,         |  4
     20 ;   int             filter_weight,    |  5
     21 ;   unsigned int   *accumulator,      |  6
     22 ;   unsigned short *count)            |  7
     23 global sym(vp8_temporal_filter_apply_sse2) PRIVATE
     24 sym(vp8_temporal_filter_apply_sse2):
     25 
     26     push        rbp
     27     mov         rbp, rsp
     28     SHADOW_ARGS_TO_STACK 8
     29     SAVE_XMM 7
     30     GET_GOT     rbx
     31     push        rsi
     32     push        rdi
     33     ALIGN_STACK 16, rax
     34     %define block_size    0
     35     %define strength      16
     36     %define filter_weight 32
     37     %define rounding_bit  48
     38     %define rbp_backup    64
     39     %define stack_size    80
     40     sub         rsp,           stack_size
     41     mov         [rsp + rbp_backup], rbp
     42     ; end prolog
     43 
     44         mov         rdx,            arg(3)
     45         mov         [rsp + block_size], rdx
     46         movd        xmm6,            arg(4)
     47         movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
     48 
     49         ; calculate the rounding bit outside the loop
     50         ; 0x8000 >> (16 - strength)
     51         mov         rdx,            16
     52         sub         rdx,            arg(4) ; 16 - strength
     53         movq        xmm4,           rdx    ; can't use rdx w/ shift
     54         movdqa      xmm5,           [GLOBAL(_const_top_bit)]
     55         psrlw       xmm5,           xmm4
     56         movdqa      [rsp + rounding_bit], xmm5
     57 
     58         mov         rsi,            arg(0) ; src/frame1
     59         mov         rdx,            arg(2) ; predictor frame
     60         mov         rdi,            arg(6) ; accumulator
     61         mov         rax,            arg(7) ; count
     62 
     63         ; dup the filter weight and store for later
     64         movd        xmm0,           arg(5) ; filter_weight
     65         pshuflw     xmm0,           xmm0, 0
     66         punpcklwd   xmm0,           xmm0
     67         movdqa      [rsp + filter_weight], xmm0
     68 
     69         mov         rbp,            arg(1) ; stride
     70         pxor        xmm7,           xmm7   ; zero for extraction
     71 
     72         lea         rcx,            [rdx + 16*16*1]
     73         cmp         dword ptr [rsp + block_size], 8
     74         jne         .temporal_filter_apply_load_16
     75         lea         rcx,            [rdx + 8*8*1]
     76 
     77 .temporal_filter_apply_load_8:
     78         movq        xmm0,           [rsi]  ; first row
     79         lea         rsi,            [rsi + rbp] ; += stride
     80         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
     81         movq        xmm1,           [rsi]  ; second row
     82         lea         rsi,            [rsi + rbp] ; += stride
     83         punpcklbw   xmm1,           xmm7   ; src[ 8-15]
     84         jmp         .temporal_filter_apply_load_finished
     85 
     86 .temporal_filter_apply_load_16:
     87         movdqa      xmm0,           [rsi]  ; src (frame1)
     88         lea         rsi,            [rsi + rbp] ; += stride
     89         movdqa      xmm1,           xmm0
     90         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
     91         punpckhbw   xmm1,           xmm7   ; src[ 8-15]
     92 
     93 .temporal_filter_apply_load_finished:
     94         movdqa      xmm2,           [rdx]  ; predictor (frame2)
     95         movdqa      xmm3,           xmm2
     96         punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
     97         punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
     98 
     99         ; modifier = src_byte - pixel_value
    100         psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
    101         psubw       xmm1,           xmm3   ; src - pred[ 8-15]
    102 
    103         ; modifier *= modifier
    104         pmullw      xmm0,           xmm0   ; modifer[ 0- 7]^2
    105         pmullw      xmm1,           xmm1   ; modifer[ 8-15]^2
    106 
    107         ; modifier *= 3
    108         pmullw      xmm0,           [GLOBAL(_const_3w)]
    109         pmullw      xmm1,           [GLOBAL(_const_3w)]
    110 
    111         ; modifer += 0x8000 >> (16 - strength)
    112         paddw       xmm0,           [rsp + rounding_bit]
    113         paddw       xmm1,           [rsp + rounding_bit]
    114 
    115         ; modifier >>= strength
    116         psrlw       xmm0,           [rsp + strength]
    117         psrlw       xmm1,           [rsp + strength]
    118 
    119         ; modifier = 16 - modifier
    120         ; saturation takes care of modifier > 16
    121         movdqa      xmm3,           [GLOBAL(_const_16w)]
    122         movdqa      xmm2,           [GLOBAL(_const_16w)]
    123         psubusw     xmm3,           xmm1
    124         psubusw     xmm2,           xmm0
    125 
    126         ; modifier *= filter_weight
    127         pmullw      xmm2,           [rsp + filter_weight]
    128         pmullw      xmm3,           [rsp + filter_weight]
    129 
    130         ; count
    131         movdqa      xmm4,           [rax]
    132         movdqa      xmm5,           [rax+16]
    133         ; += modifier
    134         paddw       xmm4,           xmm2
    135         paddw       xmm5,           xmm3
    136         ; write back
    137         movdqa      [rax],          xmm4
    138         movdqa      [rax+16],       xmm5
    139         lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
    140 
    141         ; load and extract the predictor up to shorts
    142         pxor        xmm7,           xmm7
    143         movdqa      xmm0,           [rdx]
    144         lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
    145         movdqa      xmm1,           xmm0
    146         punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
    147         punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
    148 
    149         ; modifier *= pixel_value
    150         pmullw      xmm0,           xmm2
    151         pmullw      xmm1,           xmm3
    152 
    153         ; expand to double words
    154         movdqa      xmm2,           xmm0
    155         punpcklwd   xmm0,           xmm7   ; [ 0- 3]
    156         punpckhwd   xmm2,           xmm7   ; [ 4- 7]
    157         movdqa      xmm3,           xmm1
    158         punpcklwd   xmm1,           xmm7   ; [ 8-11]
    159         punpckhwd   xmm3,           xmm7   ; [12-15]
    160 
    161         ; accumulator
    162         movdqa      xmm4,           [rdi]
    163         movdqa      xmm5,           [rdi+16]
    164         movdqa      xmm6,           [rdi+32]
    165         movdqa      xmm7,           [rdi+48]
    166         ; += modifier
    167         paddd       xmm4,           xmm0
    168         paddd       xmm5,           xmm2
    169         paddd       xmm6,           xmm1
    170         paddd       xmm7,           xmm3
    171         ; write back
    172         movdqa      [rdi],          xmm4
    173         movdqa      [rdi+16],       xmm5
    174         movdqa      [rdi+32],       xmm6
    175         movdqa      [rdi+48],       xmm7
    176         lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
    177 
    178         cmp         rdx,            rcx
    179         je          .temporal_filter_apply_epilog
    180         pxor        xmm7,           xmm7   ; zero for extraction
    181         cmp         dword ptr [rsp + block_size], 16
    182         je          .temporal_filter_apply_load_16
    183         jmp         .temporal_filter_apply_load_8
    184 
    185 .temporal_filter_apply_epilog:
    186     ; begin epilog
    187     mov         rbp,            [rsp + rbp_backup]
    188     add         rsp,            stack_size
    189     pop         rsp
    190     pop         rdi
    191     pop         rsi
    192     RESTORE_GOT
    193     RESTORE_XMM
    194     UNSHADOW_ARGS
    195     pop         rbp
    196     ret
    197 
    198 SECTION_RODATA
    199 align 16
    200 _const_3w:
    201     times 8 dw 3
    202 align 16
    203 _const_top_bit:
    204     times 8 dw 1<<15
    205 align 16
    206 _const_16w
    207     times 8 dw 16
    208