; (code-browser navotation residue removed; file: vp8 quantize SSE2, x86)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"


     15 ; void vp8_regular_quantize_b_sse2 | arg
     16 ;  (BLOCK  *b,                     |  0
     17 ;   BLOCKD *d)                     |  1
     18 
     19 global sym(vp8_regular_quantize_b_sse2)
     20 sym(vp8_regular_quantize_b_sse2):
     21     push        rbp
     22     mov         rbp, rsp
     23     SAVE_XMM
     24     GET_GOT     rbx
     25     push        rsi
     26 
     27 %if ABI_IS_32BIT
     28     push        rdi
     29 %else
     30   %ifidn __OUTPUT_FORMAT__,x64
     31     push        rdi
     32   %endif
     33 %endif
     34 
     35     ALIGN_STACK 16, rax
     36     %define BLOCKD_d          0  ;  8
     37     %define zrun_zbin_boost   8  ;  8
     38     %define abs_minus_zbin    16 ; 32
     39     %define temp_qcoeff       48 ; 32
     40     %define qcoeff            80 ; 32
     41     %define stack_size        112
     42     sub         rsp, stack_size
     43     ; end prolog
     44 
     45 %if ABI_IS_32BIT
     46     mov         rdi, arg(0)
     47 %else
     48   %ifidn __OUTPUT_FORMAT__,x64
     49     mov         rdi, rcx                    ; BLOCK *b
     50     mov         [rsp + BLOCKD_d], rdx
     51   %else
     52     ;mov         rdi, rdi                    ; BLOCK *b
     53     mov         [rsp + BLOCKD_d], rsi
     54   %endif
     55 %endif
     56 
     57     mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
     58     mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
     59     movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value
     60 
     61     ; z
     62     movdqa      xmm0, [rdx]
     63     movdqa      xmm4, [rdx + 16]
     64     mov         rdx, [rdi + vp8_block_round] ; round_ptr
     65 
     66     pshuflw     xmm7, xmm7, 0
     67     punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value
     68 
     69     movdqa      xmm1, xmm0
     70     movdqa      xmm5, xmm4
     71 
     72     ; sz
     73     psraw       xmm0, 15
     74     psraw       xmm4, 15
     75 
     76     ; (z ^ sz)
     77     pxor        xmm1, xmm0
     78     pxor        xmm5, xmm4
     79 
     80     ; x = abs(z)
     81     psubw       xmm1, xmm0
     82     psubw       xmm5, xmm4
     83 
     84     movdqa      xmm2, [rcx]
     85     movdqa      xmm3, [rcx + 16]
     86     mov         rcx, [rdi + vp8_block_quant] ; quant_ptr
     87 
     88     ; *zbin_ptr + zbin_oq_value
     89     paddw       xmm2, xmm7
     90     paddw       xmm3, xmm7
     91 
     92     ; x - (*zbin_ptr + zbin_oq_value)
     93     psubw       xmm1, xmm2
     94     psubw       xmm5, xmm3
     95     movdqa      [rsp + abs_minus_zbin], xmm1
     96     movdqa      [rsp + abs_minus_zbin + 16], xmm5
     97 
     98     ; add (zbin_ptr + zbin_oq_value) back
     99     paddw       xmm1, xmm2
    100     paddw       xmm5, xmm3
    101 
    102     movdqa      xmm2, [rdx]
    103     movdqa      xmm6, [rdx + 16]
    104 
    105     movdqa      xmm3, [rcx]
    106     movdqa      xmm7, [rcx + 16]
    107 
    108     ; x + round
    109     paddw       xmm1, xmm2
    110     paddw       xmm5, xmm6
    111 
    112     ; y = x * quant_ptr >> 16
    113     pmulhw      xmm3, xmm1
    114     pmulhw      xmm7, xmm5
    115 
    116     ; y += x
    117     paddw       xmm1, xmm3
    118     paddw       xmm5, xmm7
    119 
    120     movdqa      [rsp + temp_qcoeff], xmm1
    121     movdqa      [rsp + temp_qcoeff + 16], xmm5
    122 
    123     pxor        xmm6, xmm6
    124     ; zero qcoeff
    125     movdqa      [rsp + qcoeff], xmm6
    126     movdqa      [rsp + qcoeff + 16], xmm6
    127 
    128     mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
    129     mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
    130     mov         [rsp + zrun_zbin_boost], rsi
    131 
    132 %macro ZIGZAG_LOOP 1
    133     movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc
    134 
    135     ; x
    136     movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
    137 
    138     ; if (x >= zbin)
    139     sub         cx, WORD PTR[rsi]           ; x - zbin
    140     lea         rsi, [rsi + 2]              ; zbin_boost_ptr++
    141     jl          rq_zigzag_loop_%1           ; x < zbin
    142 
    143     movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]
    144 
    145     ; downshift by quant_shift[rdx]
    146     movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
    147     sar         edi, cl                     ; also sets Z bit
    148     je          rq_zigzag_loop_%1           ; !y
    149     mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
    150     mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
    151 rq_zigzag_loop_%1:
    152 %endmacro
    153 ZIGZAG_LOOP 0
    154 ZIGZAG_LOOP 1
    155 ZIGZAG_LOOP 2
    156 ZIGZAG_LOOP 3
    157 ZIGZAG_LOOP 4
    158 ZIGZAG_LOOP 5
    159 ZIGZAG_LOOP 6
    160 ZIGZAG_LOOP 7
    161 ZIGZAG_LOOP 8
    162 ZIGZAG_LOOP 9
    163 ZIGZAG_LOOP 10
    164 ZIGZAG_LOOP 11
    165 ZIGZAG_LOOP 12
    166 ZIGZAG_LOOP 13
    167 ZIGZAG_LOOP 14
    168 ZIGZAG_LOOP 15
    169 
    170     movdqa      xmm2, [rsp + qcoeff]
    171     movdqa      xmm3, [rsp + qcoeff + 16]
    172 
    173 %if ABI_IS_32BIT
    174     mov         rdi, arg(1)
    175 %else
    176     mov         rdi, [rsp + BLOCKD_d]
    177 %endif
    178 
    179     mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
    180     mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
    181 
    182     ; y ^ sz
    183     pxor        xmm2, xmm0
    184     pxor        xmm3, xmm4
    185     ; x = (y ^ sz) - sz
    186     psubw       xmm2, xmm0
    187     psubw       xmm3, xmm4
    188 
    189     ; dequant
    190     movdqa      xmm0, [rcx]
    191     movdqa      xmm1, [rcx + 16]
    192 
    193     mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr
    194 
    195     pmullw      xmm0, xmm2
    196     pmullw      xmm1, xmm3
    197 
    198     movdqa      [rcx], xmm2        ; store qcoeff
    199     movdqa      [rcx + 16], xmm3
    200     movdqa      [rsi], xmm0        ; store dqcoeff
    201     movdqa      [rsi + 16], xmm1
    202 
    203     ; select the last value (in zig_zag order) for EOB
    204     pcmpeqw     xmm2, xmm6
    205     pcmpeqw     xmm3, xmm6
    206     ; !
    207     pcmpeqw     xmm6, xmm6
    208     pxor        xmm2, xmm6
    209     pxor        xmm3, xmm6
    210     ; mask inv_zig_zag
    211     pand        xmm2, [GLOBAL(inv_zig_zag)]
    212     pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    213     ; select the max value
    214     pmaxsw      xmm2, xmm3
    215     pshufd      xmm3, xmm2, 00001110b
    216     pmaxsw      xmm2, xmm3
    217     pshuflw     xmm3, xmm2, 00001110b
    218     pmaxsw      xmm2, xmm3
    219     pshuflw     xmm3, xmm2, 00000001b
    220     pmaxsw      xmm2, xmm3
    221     movd        eax, xmm2
    222     and         eax, 0xff
    223     mov         [rdi + vp8_blockd_eob], eax
    224 
    225     ; begin epilog
    226     add         rsp, stack_size
    227     pop         rsp
    228 %if ABI_IS_32BIT
    229     pop         rdi
    230 %else
    231   %ifidn __OUTPUT_FORMAT__,x64
    232     pop         rdi
    233   %endif
    234 %endif
    235     pop         rsi
    236     RESTORE_GOT
    237     RESTORE_XMM
    238     pop         rbp
    239     ret
    241 ; int vp8_fast_quantize_b_impl_sse2 | arg
    242 ;  (short *coeff_ptr,               |  0
    243 ;   short *qcoeff_ptr,              |  1
    244 ;   short *dequant_ptr,             |  2
    245 ;   short *inv_scan_order,          |  3
    246 ;   short *round_ptr,               |  4
    247 ;   short *quant_ptr,               |  5
    248 ;   short *dqcoeff_ptr)             |  6
    249 
    250 global sym(vp8_fast_quantize_b_impl_sse2)
    251 sym(vp8_fast_quantize_b_impl_sse2):
    252     push        rbp
    253     mov         rbp, rsp
    254     SHADOW_ARGS_TO_STACK 7
    255     push        rsi
    256     push        rdi
    257     ; end prolog
    258 
    259     mov         rdx, arg(0)                 ;coeff_ptr
    260     mov         rcx, arg(2)                 ;dequant_ptr
    261     mov         rdi, arg(4)                 ;round_ptr
    262     mov         rsi, arg(5)                 ;quant_ptr
    263 
    264     movdqa      xmm0, XMMWORD PTR[rdx]
    265     movdqa      xmm4, XMMWORD PTR[rdx + 16]
    266 
    267     movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
    268     movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi
    269 
    270     movdqa      xmm1, xmm0
    271     movdqa      xmm5, xmm4
    272 
    273     psraw       xmm0, 15                    ;sign of z (aka sz)
    274     psraw       xmm4, 15                    ;sign of z (aka sz)
    275 
    276     pxor        xmm1, xmm0
    277     pxor        xmm5, xmm4
    278     psubw       xmm1, xmm0                  ;x = abs(z)
    279     psubw       xmm5, xmm4                  ;x = abs(z)
    280 
    281     paddw       xmm1, xmm2
    282     paddw       xmm5, xmm3
    283 
    284     pmulhw      xmm1, XMMWORD PTR[rsi]
    285     pmulhw      xmm5, XMMWORD PTR[rsi + 16]
    286 
    287     mov         rdi, arg(1)                 ;qcoeff_ptr
    288     mov         rsi, arg(6)                 ;dqcoeff_ptr
    289 
    290     movdqa      xmm2, XMMWORD PTR[rcx]
    291     movdqa      xmm3, XMMWORD PTR[rcx + 16]
    292 
    293     pxor        xmm1, xmm0
    294     pxor        xmm5, xmm4
    295     psubw       xmm1, xmm0
    296     psubw       xmm5, xmm4
    297 
    298     movdqa      XMMWORD PTR[rdi], xmm1
    299     movdqa      XMMWORD PTR[rdi + 16], xmm5
    300 
    301     pmullw      xmm2, xmm1
    302     pmullw      xmm3, xmm5
    303 
    304     mov         rdi, arg(3)                 ;inv_scan_order
    305 
    306     ; Start with 16
    307     pxor        xmm4, xmm4                  ;clear all bits
    308     pcmpeqw     xmm1, xmm4
    309     pcmpeqw     xmm5, xmm4
    310 
    311     pcmpeqw     xmm4, xmm4                  ;set all bits
    312     pxor        xmm1, xmm4
    313     pxor        xmm5, xmm4
    314 
    315     pand        xmm1, XMMWORD PTR[rdi]
    316     pand        xmm5, XMMWORD PTR[rdi+16]
    317 
    318     pmaxsw      xmm1, xmm5
    319 
    320     ; now down to 8
    321     pshufd      xmm5, xmm1, 00001110b
    322 
    323     pmaxsw      xmm1, xmm5
    324 
    325     ; only 4 left
    326     pshuflw     xmm5, xmm1, 00001110b
    327 
    328     pmaxsw      xmm1, xmm5
    329 
    330     ; okay, just 2!
    331     pshuflw     xmm5, xmm1, 00000001b
    332 
    333     pmaxsw      xmm1, xmm5
    334 
    335     movd        rax, xmm1
    336     and         rax, 0xff
    337 
    338     movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff
    339     movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff
    340 
    341     ; begin epilog
    342     pop         rdi
    343     pop         rsi
    344     UNSHADOW_ARGS
    345     pop         rbp
    346     ret
    348 SECTION_RODATA
    349 align 16
    350 zig_zag:
    351   dw 0x0000, 0x0001, 0x0004, 0x0008
    352   dw 0x0005, 0x0002, 0x0003, 0x0006
    353   dw 0x0009, 0x000c, 0x000d, 0x000a
    354   dw 0x0007, 0x000b, 0x000e, 0x000f
    355 inv_zig_zag:
    356   dw 0x0001, 0x0002, 0x0006, 0x0007
    357   dw 0x0003, 0x0005, 0x0008, 0x000d
    358   dw 0x0004, 0x0009, 0x000c, 0x000e
    359   dw 0x000a, 0x000b, 0x000f, 0x0010
    360