; SSE2 MFQE helpers: filter-by-weight (16x16, 8x8) and combined variance+SAD.
      1 ;
      2 ;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
      3 ;
      4 ;  Use of this source code is governed by a BSD-style license
      5 ;  that can be found in the LICENSE file in the root of the source
      6 ;  tree. An additional intellectual property rights grant can be found
      7 ;  in the file PATENTS.  All contributing project authors may
      8 ;  be found in the AUTHORS file in the root of the source tree.
      9 ;
     10 
     11 ;  This file is a duplicate of mfqe_sse2.asm in VP8.
     12 ;  TODO(jackychen): Find a way to fix the duplicate.
     13 %include "vpx_ports/x86_abi_support.asm"
     14 
;-----------------------------------------------------------------------------
;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; Weighted in-place blend of a 16x16 block, used by MFQE:
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
; where 16 == 1 << MFQE_PRECISION (tMFQE) and 8 is the rounding bias
; (tMFQE_round). src and dst must be 16-byte aligned (movdqa loads/stores).
; NOTE(review): src_weight is presumably in [0, 16] — outside that range the
; 16-bit products can wrap; confirm against callers.
; Clobbers rax, rcx, rdx and xmm0-xmm6; xmm6 is callee-saved on Win64 and is
; preserved via SAVE_XMM/RESTORE_XMM; rsi/rdi are pushed/popped explicitly.
;-----------------------------------------------------------------------------
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 8 x (1 << MFQE_PRECISION)
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count (16 rows)
    pxor        xmm6, xmm6                  ; zero, used to widen u8 -> u16

.combine:
    movdqa      xmm2, [rax]                 ; one row of src (16 pixels)
    movdqa      xmm4, [rdx]                 ; one row of dst (16 pixels)
    add         rax, rsi                    ; advance src to next row

    ; src * src_weight (zero-extend bytes to words, multiply per word)
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6                  ; low 8 pixels -> words
    punpckhbw   xmm3, xmm6                  ; high 8 pixels -> words
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift back down to pixel scale
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)] ; + (1 << (MFQE_PRECISION - 1))
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4                     ; >> MFQE_PRECISION
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3                  ; repack 16 words -> 16 bytes
    movdqa      [rdx], xmm2                 ; store blended row
    add         rdx, rdi                    ; advance dst to next row

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret
     92 
;-----------------------------------------------------------------------------
;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
;
; 8x8 variant of the weighted blend:
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
; Each row is only 8 pixels, so rows are loaded/stored with movq (no
; 16-byte alignment requirement) and one widen/multiply pass suffices.
; Uses only xmm0-xmm4 (all volatile on both SysV and Win64), hence no
; SAVE_XMM in the prolog.
;-----------------------------------------------------------------------------
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]       ; 8 x (1 << MFQE_PRECISION)
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count (8 rows)
    pxor        xmm4, xmm4                  ; zero, used to widen u8 -> u16

.combine:
    movq        xmm2, [rax]                 ; one row of src (8 pixels)
    movq        xmm3, [rdx]                 ; one row of dst (8 pixels)
    add         rax, rsi                    ; advance src to next row

    ; src * src_weight (widen u8 -> u16, multiply per word)
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift back down to pixel scale
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)] ; + (1 << (MFQE_PRECISION - 1))
    psrlw       xmm2, 4                     ; >> MFQE_PRECISION

    packuswb    xmm2, xmm4                  ; 8 words -> 8 bytes (high half 0)
    movq        [rdx], xmm2                 ; store blended row (low 8 bytes)
    add         rdx, rdi                    ; advance dst to next row

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret
    159 
;-----------------------------------------------------------------------------
;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
;
; One pass over a 16x16 block producing two per-pixel-scaled results:
;   *sad      = (SAD(src1, src2) + 128) >> 8          (mean abs diff, rounded)
;   *variance = (SSE2(src2) - sum(src2)^2/256 + 128) >> 8
; i.e. the variance of src2 and the SAD between the two blocks, both divided
; by the 256 pixels of the block with rounding.
; Uses xmm0-xmm5 only (volatile everywhere), so no SAVE_XMM is needed.
;-----------------------------------------------------------------------------
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
    ; NOTE(review): the loads below are movdqa (aligned) despite this comment;
    ; presumably the frame rows are in fact 16-byte aligned — confirm.
.accumulate:
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2): psadbw leaves two u16 sums in the low words of each
    ; 64-bit half. Row maximum is 16*255, total 16 rows -> fits in 16 bits,
    ; so the saturating word add cannot clip.
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. instead,
    ; it expects a signed and an unsigned value. so instead we zero extend
    ; and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2                  ; low 8 pixels -> words
    punpckhbw   xmm1, xmm2                  ; high 8 pixels -> words
    pmaddwd     xmm0, xmm0                  ; square and pair-sum -> dwords
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0                  ; accumulate SSE in 4 dword lanes
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store: fold the two 64-bit halves together, then
    ; round and divide by the 256 pixels of the block.
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]        ; + 128 for rounding
    psrld       xmm0, 8                     ; / 256

    mov         rax,  arg(5)
    movd        [rax], xmm0                 ; *sad = rounded mean abs diff

    ; Accumulate sum of src2 (fold the two psadbw halves together)
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square src2. Ignore high value: sum <= 16*16*255 = 65280, and
    ; 65280^2 < 2^32, so the product fits in the low dword and the
    ; per-dword shift below is safe.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8                     ; sum^2 / 256 (mean correction)

    ; phaddw could be used to sum adjacent values but we want
    ; all the values summed. promote to doubles, accumulate,
    ; shift and sum
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2                  ; dwords 0,1 -> qwords
    punpckhdq   xmm5, xmm2                  ; dwords 2,3 -> qwords
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; xmm1 low dword = total SSE

    psubd       xmm1, xmm0                  ; SSE - sum^2/256

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1                 ; *variance


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
    271 
SECTION_RODATA
align 16
; 128 as a single 128-bit constant (value lives in the low qword); used as
; the rounding bias before the >> 8 divides in vp9_variance_and_sad_16x16.
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128                              ; NASM lacks ddq; emit qword pair
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION (weight scale; 8 words for pmullw)
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1) (rounding bias before >> 4)
    times 8 dw 0x08
    288