Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jidctred.asm - reduced-size IDCT (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; This file contains inverse-DCT routines that produce reduced-size
     18 ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block.
     19 ; The following code is based directly on the IJG's original jidctred.c;
     20 ; see the jidctred.c for more details.
     21 ;
     22 ; [TAB8]
     23 
     24 %include "jsimdext.inc"
     25 %include "jdct.inc"
     26 
     27 ; --------------------------------------------------------------------------
        ; Fixed-point parameters for the reduced-size IDCT.
        ;   CONST_BITS   - number of fractional bits carried by the F_* multipliers
        ;                  below (F_x is x scaled by 2^CONST_BITS and rounded).
        ;   PASS1_BITS   - extra fractional bits kept in the pass-1 (column)
        ;                  results; the pass-2 descale shifts them out again.
        ;   DESCALE_Pp_n - right-shift count used at the end of pass p of the
        ;                  n x n routine, after adding the matching
        ;                  PD_DESCALE_Pp_n rounding constant.
     28 
     29 %define CONST_BITS      13
     30 %define PASS1_BITS      2
     31 
     32 %define DESCALE_P1_4    (CONST_BITS-PASS1_BITS+1)
     33 %define DESCALE_P2_4    (CONST_BITS+PASS1_BITS+3+1)
     34 %define DESCALE_P1_2    (CONST_BITS-PASS1_BITS+2)
     35 %define DESCALE_P2_2    (CONST_BITS+PASS1_BITS+3+2)
     36 
        ; Precomputed multipliers, valid only for CONST_BITS == 13
        ; (FIX(x) = x * 2^13, rounded to the nearest integer).
     37 %if CONST_BITS == 13
     38 F_0_211 equ      1730           ; FIX(0.211164243)
     39 F_0_509 equ      4176           ; FIX(0.509795579)
     40 F_0_601 equ      4926           ; FIX(0.601344887)
     41 F_0_720 equ      5906           ; FIX(0.720959822)
     42 F_0_765 equ      6270           ; FIX(0.765366865)
     43 F_0_850 equ      6967           ; FIX(0.850430095)
     44 F_0_899 equ      7373           ; FIX(0.899976223)
     45 F_1_061 equ      8697           ; FIX(1.061594337)
     46 F_1_272 equ     10426           ; FIX(1.272758580)
     47 F_1_451 equ     11893           ; FIX(1.451774981)
     48 F_1_847 equ     15137           ; FIX(1.847759065)
     49 F_2_172 equ     17799           ; FIX(2.172734803)
     50 F_2_562 equ     20995           ; FIX(2.562915447)
     51 F_3_624 equ     29692           ; FIX(3.624509785)
     52 %else
     53 ; NASM cannot do compile-time arithmetic on floating-point constants.
        ; The integer literals below are therefore the same constants pre-scaled
        ; by 2^30; DESCALE(x,n) rounds and shifts them right by n bits, leaving
        ; CONST_BITS fractional bits.
     54 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
     55 F_0_211 equ     DESCALE( 226735879,30-CONST_BITS)       ; FIX(0.211164243)
     56 F_0_509 equ     DESCALE( 547388834,30-CONST_BITS)       ; FIX(0.509795579)
     57 F_0_601 equ     DESCALE( 645689155,30-CONST_BITS)       ; FIX(0.601344887)
     58 F_0_720 equ     DESCALE( 774124714,30-CONST_BITS)       ; FIX(0.720959822)
     59 F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
     60 F_0_850 equ     DESCALE( 913142361,30-CONST_BITS)       ; FIX(0.850430095)
     61 F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
     62 F_1_061 equ     DESCALE(1139878239,30-CONST_BITS)       ; FIX(1.061594337)
     63 F_1_272 equ     DESCALE(1366614119,30-CONST_BITS)       ; FIX(1.272758580)
     64 F_1_451 equ     DESCALE(1558831516,30-CONST_BITS)       ; FIX(1.451774981)
     65 F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
     66 F_2_172 equ     DESCALE(2332956230,30-CONST_BITS)       ; FIX(2.172734803)
     67 F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
     68 F_3_624 equ     DESCALE(3891787747,30-CONST_BITS)       ; FIX(3.624509785)
     69 %endif
     70 
     71 ; --------------------------------------------------------------------------
        ; Constant table shared by both IDCT routines in this file.
        ;
        ; Each PW_* entry repeats one pair of 16-bit multipliers four times
        ; (8 words = 16 bytes), so it can be used directly as the memory operand
        ; of pmaddwd on word-interleaved inputs. In the names, "F" marks a
        ; positive and "MF" a negated F_* constant, e.g.
        ; PW_F184_MF076 = (F_1_847, -F_0_765).
        ;
        ; Each PD_DESCALE_* entry is four dwords of 1 << (shift-1), i.e. the
        ; rounding bias that is added before the arithmetic right shift by the
        ; corresponding DESCALE_* count.
        ;
        ; PB_CENTERJSAMP is 16 bytes of CENTERJSAMPLE (not defined in this
        ; listing; presumably provided by one of the %include'd files).
     72         SECTION SEG_CONST
     73 
     74         alignz  16
     75         global  EXTN(jconst_idct_red_sse2)
     76 
     77 EXTN(jconst_idct_red_sse2):
     78 
     79 PW_F184_MF076   times 4 dw  F_1_847,-F_0_765
     80 PW_F256_F089    times 4 dw  F_2_562, F_0_899
     81 PW_F106_MF217   times 4 dw  F_1_061,-F_2_172
     82 PW_MF060_MF050  times 4 dw -F_0_601,-F_0_509
     83 PW_F145_MF021   times 4 dw  F_1_451,-F_0_211
     84 PW_F362_MF127   times 4 dw  F_3_624,-F_1_272
     85 PW_F085_MF072   times 4 dw  F_0_850,-F_0_720
     86 PD_DESCALE_P1_4 times 4 dd  1 << (DESCALE_P1_4-1)
     87 PD_DESCALE_P2_4 times 4 dd  1 << (DESCALE_P2_4-1)
     88 PD_DESCALE_P1_2 times 4 dd  1 << (DESCALE_P1_2-1)
     89 PD_DESCALE_P2_2 times 4 dd  1 << (DESCALE_P2_2-1)
     90 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
     91 
     92         alignz  16
     93 
     94 ; --------------------------------------------------------------------------
     95         SECTION SEG_TEXT
     96         BITS    64
     97 ;
     98 ; Perform dequantization and inverse DCT on one block of coefficients,
     99 ; producing a reduced-size 4x4 output block.
    100 ;
    101 ; GLOBAL(void)
    102 ; jsimd_idct_4x4_sse2 (void *dct_table, JCOEFPTR coef_block,
    103 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
    104 ;
    105 
    106 ; r10 = void *dct_table
    107 ; r11 = JCOEFPTR coef_block
    108 ; r12 = JSAMPARRAY output_buf
    109 ; r13 = JDIMENSION output_col
    110 
    111 %define original_rbp    rbp+0
    112 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
    113 %define WK_NUM          2
        ; original_rbp : slot at [rbp] in which the prologue stores the stack
        ;                pointer value from before the frame was aligned.
        ; wk(i)        : i-th of WK_NUM 16-byte scratch slots directly below rbp.
    114 
    115         align   16
    116         global  EXTN(jsimd_idct_4x4_sse2)
    117 
    118 EXTN(jsimd_idct_4x4_sse2):
                ; Register roles are listed in the header comment above
                ; (r10=dct_table, r11=coef_block, r12=output_buf,
                ; r13=output_col); they are presumably loaded by collect_args,
                ; which is defined outside this listing -- confirm there.
                ;
                ; Prologue: build a 16-byte-aligned frame so that wk(i) can be
                ; accessed with movdqa. The pre-alignment stack pointer is kept
                ; at [rbp] and restored by the epilogue via "pop rsp".
                ; NOTE(review): only 4 bytes are reserved before aligning, yet
                ; an 8-byte value is stored at [rsp]. That is safe only when rsp
                ; is already 16-byte aligned after the push (as the standard
                ; x86-64 entry alignment gives) -- confirm for all callers.
    119         push    rbp
    120         mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
    121         sub     rsp, byte 4
    122         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    123         mov     [rsp],rax                       ; remember the unaligned rsp at [aligned rsp]
    124         mov     rbp,rsp                         ; rbp = aligned rbp
    125         lea     rsp, [wk(0)]                    ; reserve the WK_NUM scratch slots below rbp
    126         collect_args
    127 
    128         ; ---- Pass 1: process columns from input.
    129 
    130         mov     rdx, r10                ; quantptr
    131         mov     rsi, r11                ; inptr
    132 
                ; Shortcut for blocks whose relevant AC coefficients are all
                ; zero (can be compiled out with NO_ZERO_COLUMN_TEST_4X4_SSE2).
                ; First a cheap test on the first dword of rows 1 and 2, then
                ; rows 1,2,3,5,6,7 are OR'd together. Row 4 is not tested; this
                ; routine never loads it.
    133 %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2
    134         mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    135         or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    136         jnz     short .columnDCT
    137 
    138         movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    139         movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    140         por     xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    141         por     xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    142         por     xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    143         por     xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    144         por     xmm0,xmm1
                ; Signed saturation maps every nonzero word to a nonzero byte,
                ; so after two packs the low dword is zero iff all eight OR'd
                ; words were zero.
    145         packsswb xmm0,xmm0
    146         packsswb xmm0,xmm0
    147         movd    eax,xmm0                ; 32-bit write also clears the upper half of rax
    148         test    rax,rax
    149         jnz     short .columnDCT
    150 
    151         ; -- AC terms all zero
    152 
                ; Only the dequantized DC row contributes: scale it by
                ; 2^PASS1_BITS and replicate each value four times to build the
                ; same column layout the full path produces at .column_end.
    153         movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    154         pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    155 
    156         psllw   xmm0,PASS1_BITS
    157 
    158         movdqa    xmm3,xmm0     ; xmm0=in0=(00 01 02 03 04 05 06 07)
    159         punpcklwd xmm0,xmm0     ; xmm0=(00 00 01 01 02 02 03 03)
    160         punpckhwd xmm3,xmm3     ; xmm3=(04 04 05 05 06 06 07 07)
    161 
    162         pshufd  xmm1,xmm0,0x50  ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01)
    163         pshufd  xmm0,xmm0,0xFA  ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03)
    164         pshufd  xmm6,xmm3,0x50  ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05)
    165         pshufd  xmm3,xmm3,0xFA  ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07)
    166 
    167         jmp     near .column_end
    168 %endif
    169 .columnDCT:
    170 
    171         ; -- Odd part
    172 
                ; Dequantize rows 1,3,5,7 (coefficient * quantization multiplier).
    173         movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    174         movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    175         pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    176         pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    177         movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    178         movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    179         pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    180         pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    181 
                ; Interleave rows 1 and 3 word-wise so that each pmaddwd yields
                ; row1*c0 + row3*c1 per column as a 32-bit partial sum
                ; (L = columns 0-3, H = columns 4-7).
    182         movdqa    xmm4,xmm0
    183         movdqa    xmm5,xmm0
    184         punpcklwd xmm4,xmm1
    185         punpckhwd xmm5,xmm1
    186         movdqa    xmm0,xmm4
    187         movdqa    xmm1,xmm5
    188         pmaddwd   xmm4,[rel PW_F256_F089]       ; xmm4=(tmp2L)
    189         pmaddwd   xmm5,[rel PW_F256_F089]       ; xmm5=(tmp2H)
    190         pmaddwd   xmm0,[rel PW_F106_MF217]      ; xmm0=(tmp0L)
    191         pmaddwd   xmm1,[rel PW_F106_MF217]      ; xmm1=(tmp0H)
    192 
                ; Same for rows 5 and 7; the parenthesised names are partial
                ; sums that are completed by the paddd group below.
    193         movdqa    xmm6,xmm2
    194         movdqa    xmm7,xmm2
    195         punpcklwd xmm6,xmm3
    196         punpckhwd xmm7,xmm3
    197         movdqa    xmm2,xmm6
    198         movdqa    xmm3,xmm7
    199         pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2L)
    200         pmaddwd   xmm7,[rel PW_MF060_MF050]     ; xmm7=(tmp2H)
    201         pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0L)
    202         pmaddwd   xmm3,[rel PW_F145_MF021]      ; xmm3=(tmp0H)
    203 
    204         paddd   xmm6,xmm4               ; xmm6=tmp2L
    205         paddd   xmm7,xmm5               ; xmm7=tmp2H
    206         paddd   xmm2,xmm0               ; xmm2=tmp0L
    207         paddd   xmm3,xmm1               ; xmm3=tmp0H
    208 
                ; Spill tmp0 to the stack scratch area; all eight xmm registers
                ; used by this routine are needed for the even part.
    209         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp0L
    210         movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=tmp0H
    211 
    212         ; -- Even part
    213 
    214         movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    215         movdqa  xmm5, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    216         movdqa  xmm0, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    217         pmullw  xmm4, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    218         pmullw  xmm5, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    219         pmullw  xmm0, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    220 
                ; Unpacking row 0 into the HIGH word of each dword gives
                ; value<<16; one arithmetic right shift then sign-extends and
                ; leaves value<<(CONST_BITS+1).
    221         pxor      xmm1,xmm1
    222         pxor      xmm2,xmm2
    223         punpcklwd xmm1,xmm4             ; xmm1=tmp0L
    224         punpckhwd xmm2,xmm4             ; xmm2=tmp0H
    225         psrad     xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1
    226         psrad     xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld xmm2,CONST_BITS+1
    227 
    228         movdqa    xmm3,xmm5             ; xmm5=in2=z2
    229         punpcklwd xmm5,xmm0             ; xmm0=in6=z3
    230         punpckhwd xmm3,xmm0
    231         pmaddwd   xmm5,[rel PW_F184_MF076]      ; xmm5=tmp2L
    232         pmaddwd   xmm3,[rel PW_F184_MF076]      ; xmm3=tmp2H
    233 
    234         movdqa  xmm4,xmm1
    235         movdqa  xmm0,xmm2
    236         paddd   xmm1,xmm5               ; xmm1=tmp10L
    237         paddd   xmm2,xmm3               ; xmm2=tmp10H
    238         psubd   xmm4,xmm5               ; xmm4=tmp12L
    239         psubd   xmm0,xmm3               ; xmm0=tmp12H
    240 
    241         ; -- Final output stage
    242 
                ; Rows 0 and 3 = tmp10 +/- odd-part tmp2 (held in xmm6/xmm7).
    243         movdqa  xmm5,xmm1
    244         movdqa  xmm3,xmm2
    245         paddd   xmm1,xmm6               ; xmm1=data0L
    246         paddd   xmm2,xmm7               ; xmm2=data0H
    247         psubd   xmm5,xmm6               ; xmm5=data3L
    248         psubd   xmm3,xmm7               ; xmm3=data3H
    249 
    250         movdqa  xmm6,[rel PD_DESCALE_P1_4]      ; xmm6=rounding bias for the pass-1 descale
    251 
    252         paddd   xmm1,xmm6
    253         paddd   xmm2,xmm6
    254         psrad   xmm1,DESCALE_P1_4
    255         psrad   xmm2,DESCALE_P1_4
    256         paddd   xmm5,xmm6
    257         paddd   xmm3,xmm6
    258         psrad   xmm5,DESCALE_P1_4
    259         psrad   xmm3,DESCALE_P1_4
    260 
    261         packssdw  xmm1,xmm2             ; xmm1=data0=(00 01 02 03 04 05 06 07)
    262         packssdw  xmm5,xmm3             ; xmm5=data3=(30 31 32 33 34 35 36 37)
    263 
                ; Rows 1 and 2 = tmp12 +/- odd-part tmp0 (reloaded from wk).
    264         movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp0L
    265         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp0H
    266 
    267         movdqa  xmm2,xmm4
    268         movdqa  xmm3,xmm0
    269         paddd   xmm4,xmm7               ; xmm4=data1L
    270         paddd   xmm0,xmm6               ; xmm0=data1H
    271         psubd   xmm2,xmm7               ; xmm2=data2L
    272         psubd   xmm3,xmm6               ; xmm3=data2H
    273 
    274         movdqa  xmm7,[rel PD_DESCALE_P1_4]      ; xmm7=rounding bias for the pass-1 descale
    275 
    276         paddd   xmm4,xmm7
    277         paddd   xmm0,xmm7
    278         psrad   xmm4,DESCALE_P1_4
    279         psrad   xmm0,DESCALE_P1_4
    280         paddd   xmm2,xmm7
    281         paddd   xmm3,xmm7
    282         psrad   xmm2,DESCALE_P1_4
    283         psrad   xmm3,DESCALE_P1_4
    284 
    285         packssdw  xmm4,xmm0             ; xmm4=data1=(10 11 12 13 14 15 16 17)
    286         packssdw  xmm2,xmm3             ; xmm2=data2=(20 21 22 23 24 25 26 27)
    287 
                ; Transpose the 4 rows x 8 columns of words so that each
                ; register holds two complete 4-element columns.
    288         movdqa    xmm6,xmm1     ; transpose coefficients(phase 1)
    289         punpcklwd xmm1,xmm4     ; xmm1=(00 10 01 11 02 12 03 13)
    290         punpckhwd xmm6,xmm4     ; xmm6=(04 14 05 15 06 16 07 17)
    291         movdqa    xmm7,xmm2     ; transpose coefficients(phase 1)
    292         punpcklwd xmm2,xmm5     ; xmm2=(20 30 21 31 22 32 23 33)
    293         punpckhwd xmm7,xmm5     ; xmm7=(24 34 25 35 26 36 27 37)
    294 
    295         movdqa    xmm0,xmm1     ; transpose coefficients(phase 2)
    296         punpckldq xmm1,xmm2     ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31)
    297         punpckhdq xmm0,xmm2     ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33)
    298         movdqa    xmm3,xmm6     ; transpose coefficients(phase 2)
    299         punpckldq xmm6,xmm7     ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35)
    300         punpckhdq xmm3,xmm7     ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37)
    301 .column_end:
                ; Both paths arrive here with xmm1=[col0 col1], xmm0=[col2 col3],
                ; xmm6=[col4 col5], xmm3=[col6 col7].
    302 
    303         ; -- Prefetch the next coefficient block
    304 
    305         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    306         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    307         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    308         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
    309 
    310         ; ---- Pass 2: process rows, store into output array.
    311 
    312         mov     rax, [original_rbp]     ; NOTE(review): dead load - rax is overwritten by "mov eax, r13d" below before any use
    313         mov     rdi, r12        ; (JSAMPROW *)
    314         mov     eax, r13d       ; eax = output_col (32-bit write zero-extends into rax)
    315 
    316         ; -- Even part
    317 
    318         pxor      xmm4,xmm4
    319         punpcklwd xmm4,xmm1             ; xmm4=tmp0
    320         psrad     xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1
    321 
    322         ; -- Odd part
    323 
    324         punpckhwd xmm1,xmm0             ; xmm1=col1 interleaved with col3
    325         punpckhwd xmm6,xmm3             ; xmm6=col5 interleaved with col7
    326         movdqa    xmm5,xmm1
    327         movdqa    xmm2,xmm6
    328         pmaddwd   xmm1,[rel PW_F256_F089]       ; xmm1=(tmp2)
    329         pmaddwd   xmm6,[rel PW_MF060_MF050]     ; xmm6=(tmp2)
    330         pmaddwd   xmm5,[rel PW_F106_MF217]      ; xmm5=(tmp0)
    331         pmaddwd   xmm2,[rel PW_F145_MF021]      ; xmm2=(tmp0)
    332 
    333         paddd     xmm6,xmm1             ; xmm6=tmp2
    334         paddd     xmm2,xmm5             ; xmm2=tmp0
    335 
    336         ; -- Even part
    337 
    338         punpcklwd xmm0,xmm3             ; xmm0=col2 interleaved with col6
    339         pmaddwd   xmm0,[rel PW_F184_MF076]      ; xmm0=tmp2
    340 
    341         movdqa    xmm7,xmm4
    342         paddd     xmm4,xmm0             ; xmm4=tmp10
    343         psubd     xmm7,xmm0             ; xmm7=tmp12
    344 
    345         ; -- Final output stage
    346 
    347         movdqa  xmm1,[rel PD_DESCALE_P2_4]      ; xmm1=rounding bias for the pass-2 descale
    348 
    349         movdqa  xmm5,xmm4
    350         movdqa  xmm3,xmm7
    351         paddd   xmm4,xmm6               ; xmm4=data0=(00 10 20 30)
    352         paddd   xmm7,xmm2               ; xmm7=data1=(01 11 21 31)
    353         psubd   xmm5,xmm6               ; xmm5=data3=(03 13 23 33)
    354         psubd   xmm3,xmm2               ; xmm3=data2=(02 12 22 32)
    355 
    356         paddd   xmm4,xmm1
    357         paddd   xmm7,xmm1
    358         psrad   xmm4,DESCALE_P2_4
    359         psrad   xmm7,DESCALE_P2_4
    360         paddd   xmm5,xmm1
    361         paddd   xmm3,xmm1
    362         psrad   xmm5,DESCALE_P2_4
    363         psrad   xmm3,DESCALE_P2_4
    364 
    365         packssdw  xmm4,xmm3             ; xmm4=(00 10 20 30 02 12 22 32)
    366         packssdw  xmm7,xmm5             ; xmm7=(01 11 21 31 03 13 23 33)
    367 
    368         movdqa    xmm0,xmm4             ; transpose coefficients(phase 1)
    369         punpcklwd xmm4,xmm7             ; xmm4=(00 01 10 11 20 21 30 31)
    370         punpckhwd xmm0,xmm7             ; xmm0=(02 03 12 13 22 23 32 33)
    371 
    372         movdqa    xmm6,xmm4             ; transpose coefficients(phase 2)
    373         punpckldq xmm4,xmm0             ; xmm4=(00 01 02 03 10 11 12 13)
    374         punpckhdq xmm6,xmm0             ; xmm6=(20 21 22 23 30 31 32 33)
    375 
                ; Saturate to signed bytes, then add the sample-centering bias
                ; byte-wise. Each dword of xmm4 is now one 4-pixel output row.
    376         packsswb  xmm4,xmm6             ; xmm4=(00 01 02 03 10 11 12 13 20 ..)
    377         paddb     xmm4,[rel PB_CENTERJSAMP]
    378 
                ; Rotate the dwords so rows 1, 2 and 3 each land in the low
                ; dword of a register, ready for the 4-byte movd stores.
    379         pshufd    xmm2,xmm4,0x39        ; xmm2=(10 11 12 13 20 21 22 23 30 ..)
    380         pshufd    xmm1,xmm4,0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
    381         pshufd    xmm3,xmm4,0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
    382 
                ; Store 4 bytes to output_buf[row] + output_col for rows 0..3.
    383         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    384         mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    385         movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
    386         movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
    387         mov     rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    388         mov     rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    389         movd    XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
    390         movd    XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
    391 
                ; Epilogue: [rbp] holds the pre-alignment rsp saved by the
                ; prologue; popping it into rsp undoes the frame alignment.
    392         uncollect_args
    393         mov     rsp,rbp         ; rsp <- aligned rbp
    394         pop     rsp             ; rsp <- original rbp
    395         pop     rbp
    396         ret
    397 
    398 
    399 ; --------------------------------------------------------------------------
    400 ;
    401 ; Perform dequantization and inverse DCT on one block of coefficients,
    402 ; producing a reduced-size 2x2 output block.
    403 ;
    404 ; GLOBAL(void)
    405 ; jsimd_idct_2x2_sse2 (void *dct_table, JCOEFPTR coef_block,
    406 ;                      JSAMPARRAY output_buf, JDIMENSION output_col)
    407 ;
    408 
    409 ; r10 = void *dct_table
    410 ; r11 = JCOEFPTR coef_block
    411 ; r12 = JSAMPARRAY output_buf
    412 ; r13 = JDIMENSION output_col
    413 
    414         align   16
    415         global  EXTN(jsimd_idct_2x2_sse2)
    416 
    417 EXTN(jsimd_idct_2x2_sse2):
                ; Register roles are listed in the header comment above
                ; (r10=dct_table, r11=coef_block, r12=output_buf,
                ; r13=output_col); they are presumably loaded by collect_args,
                ; which is defined outside this listing -- confirm there.
                ; No stack scratch area is used, so the frame is not re-aligned.
    418         push    rbp
    419         mov     rax,rsp         ; not read again in this routine before being overwritten; presumably an input to collect_args -- confirm
    420         mov     rbp,rsp
    421         collect_args
    422         push    rbx             ; rbx is used as scratch by pextrw near the end
    423 
    424         ; ---- Pass 1: process columns from input.
    425 
    426         mov     rdx, r10                ; quantptr
    427         mov     rsi, r11                ; inptr
    428 
                ; "**" marks coefficients that do not contribute to the result;
                ; pass 1 produces two rows (A, B) for columns 0,1,3,5,7.
    429         ; | input:                  | result:        |
    430         ; | 00 01 ** 03 ** 05 ** 07 |                |
    431         ; | 10 11 ** 13 ** 15 ** 17 |                |
    432         ; | ** ** ** ** ** ** ** ** |                |
    433         ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 |
    434         ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 |
    435         ; | 50 51 ** 53 ** 55 ** 57 |                |
    436         ; | ** ** ** ** ** ** ** ** |                |
    437         ; | 70 71 ** 73 ** 75 ** 77 |                |
    438 
    439         ; -- Odd part
    440 
                ; Dequantize rows 1,3,5,7 (coefficient * quantization multiplier).
    441         movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    442         movdqa  xmm1, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    443         pmullw  xmm0, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    444         pmullw  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    445         movdqa  xmm2, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    446         movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    447         pmullw  xmm2, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    448         pmullw  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    449 
    450         ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37)
    451         ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77)
    452 
                ; Build a mask that keeps the high word of every dword, i.e.
                ; the odd-numbered columns.
    453         pcmpeqd   xmm7,xmm7
    454         pslld     xmm7,WORD_BIT         ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..}
    455 
                ; Column-0 path: interleave the low halves so that pmaddwd pairs
                ; row 1 with row 3 and row 5 with row 7. Only the column-0
                ; lane of this result is consumed later.
    456         movdqa    xmm4,xmm0             ; xmm4=(10 11 ** 13 ** 15 ** 17)
    457         movdqa    xmm5,xmm2             ; xmm5=(50 51 ** 53 ** 55 ** 57)
    458         punpcklwd xmm4,xmm1             ; xmm4=(10 30 11 31 ** ** 13 33)
    459         punpcklwd xmm5,xmm3             ; xmm5=(50 70 51 71 ** ** 53 73)
    460         pmaddwd   xmm4,[rel PW_F362_MF127]
    461         pmaddwd   xmm5,[rel PW_F085_MF072]
    462 
                ; Odd-column path: shift/mask instead of unpacking, so each
                ; dword holds (row1|row5 value, row3|row7 value) of one odd
                ; column, ready for pmaddwd.
    463         psrld   xmm0,WORD_BIT           ; xmm0=(11 -- 13 -- 15 -- 17 --)
    464         pand    xmm1,xmm7               ; xmm1=(-- 31 -- 33 -- 35 -- 37)
    465         psrld   xmm2,WORD_BIT           ; xmm2=(51 -- 53 -- 55 -- 57 --)
    466         pand    xmm3,xmm7               ; xmm3=(-- 71 -- 73 -- 75 -- 77)
    467         por     xmm0,xmm1               ; xmm0=(11 31 13 33 15 35 17 37)
    468         por     xmm2,xmm3               ; xmm2=(51 71 53 73 55 75 57 77)
    469         pmaddwd xmm0,[rel PW_F362_MF127]
    470         pmaddwd xmm2,[rel PW_F085_MF072]
    471 
    472         paddd   xmm4,xmm5               ; xmm4=tmp0[col0 col1 **** col3]
    473         paddd   xmm0,xmm2               ; xmm0=tmp0[col1 col3 col5 col7]
    474 
    475         ; -- Even part
    476 
    477         movdqa  xmm6, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    478         pmullw  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    479 
    480         ; xmm6=(00 01 ** 03 ** 05 ** 07)
    481 
                ; Place each wanted DC-row value in the high word of its dword
                ; (value<<16), then shift right arithmetically so the net
                ; effect is a sign-extended value<<(CONST_BITS+2).
    482         movdqa  xmm1,xmm6               ; xmm1=(00 01 ** 03 ** 05 ** 07)
    483         pslld   xmm6,WORD_BIT           ; xmm6=(-- 00 -- ** -- ** -- **)
    484         pand    xmm1,xmm7               ; xmm1=(-- 01 -- 03 -- 05 -- 07)
    485         psrad   xmm6,(WORD_BIT-CONST_BITS-2) ; xmm6=tmp10[col0 **** **** ****]
    486         psrad   xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7]
    487 
    488         ; -- Final output stage
    489 
    490         movdqa  xmm3,xmm6
    491         movdqa  xmm5,xmm1
    492         paddd   xmm6,xmm4       ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **)
    493         paddd   xmm1,xmm0       ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7)
    494         psubd   xmm3,xmm4       ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **)
    495         psubd   xmm5,xmm0       ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7)
    496 
    497         movdqa  xmm2,[rel PD_DESCALE_P1_2]      ; xmm2=rounding bias for the pass-1 descale
    498 
    499         punpckldq  xmm6,xmm3            ; xmm6=(A0 B0 ** **)
    500 
    501         movdqa     xmm7,xmm1
    502         punpcklqdq xmm1,xmm5            ; xmm1=(A1 A3 B1 B3)
    503         punpckhqdq xmm7,xmm5            ; xmm7=(A5 A7 B5 B7)
    504 
    505         paddd   xmm6,xmm2
    506         psrad   xmm6,DESCALE_P1_2
    507 
    508         paddd   xmm1,xmm2
    509         paddd   xmm7,xmm2
    510         psrad   xmm1,DESCALE_P1_2
    511         psrad   xmm7,DESCALE_P1_2
    512 
    513         ; -- Prefetch the next coefficient block
    514 
    515         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    516         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    517         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    518         prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
    519 
    520         ; ---- Pass 2: process rows, store into output array.
    521 
    522         mov     rdi, r12        ; (JSAMPROW *)
    523         mov     eax, r13d       ; eax = output_col (32-bit write zero-extends into rax)
    524 
    525         ; | input:| result:|
    526         ; | A0 B0 |        |
    527         ; | A1 B1 | C0 C1  |
    528         ; | A3 B3 | D0 D1  |
    529         ; | A5 B5 |        |
    530         ; | A7 B7 |        |
    531 
    532         ; -- Odd part
    533 
                ; Narrow to words so that pmaddwd can pair (x1,x3) and (x5,x7)
                ; for both rows at once; the upper half of each register is a
                ; duplicate of the lower half.
    534         packssdw  xmm1,xmm1             ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3)
    535         packssdw  xmm7,xmm7             ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7)
    536         pmaddwd   xmm1,[rel PW_F362_MF127]
    537         pmaddwd   xmm7,[rel PW_F085_MF072]
    538 
    539         paddd     xmm1,xmm7             ; xmm1=tmp0[row0 row1 row0 row1]
    540 
    541         ; -- Even part
    542 
    543         pslld     xmm6,(CONST_BITS+2)   ; xmm6=tmp10[row0 row1 **** ****]
    544 
    545         ; -- Final output stage
    546 
    547         movdqa    xmm4,xmm6
    548         paddd     xmm6,xmm1     ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **)
    549         psubd     xmm4,xmm1     ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **)
    550 
    551         punpckldq xmm6,xmm4     ; xmm6=(C0 D0 C1 D1)
    552 
    553         paddd     xmm6,[rel PD_DESCALE_P2_2]    ; add rounding bias for the pass-2 descale
    554         psrad     xmm6,DESCALE_P2_2
    555 
                ; Saturate dwords -> words -> signed bytes, then add the
                ; sample-centering bias byte-wise.
    556         packssdw  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1)
    557         packsswb  xmm6,xmm6             ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..)
    558         paddb     xmm6,[rel PB_CENTERJSAMP]
    559 
                ; Each 16-bit word now holds the two pixels of one output row.
    560         pextrw  ebx,xmm6,0x00           ; ebx=(C0 D0 -- --)
    561         pextrw  ecx,xmm6,0x01           ; ecx=(C1 D1 -- --)
    562 
                ; Store 2 bytes to output_buf[row] + output_col for rows 0 and 1.
    563         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    564         mov     rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    565         mov     WORD [rdx+rax*SIZEOF_JSAMPLE], bx
    566         mov     WORD [rsi+rax*SIZEOF_JSAMPLE], cx
    567 
                ; Epilogue mirrors the prologue in reverse order.
    568         pop     rbx
    569         uncollect_args
    570         pop     rbp
    571         ret
    572 
    573 ; For some reason, the OS X linker does not honor the request to align the
    574 ; segment unless we do this.
    575         align   16
    576