Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jidctflt.asm - floating-point IDCT (SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler); it
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; This file contains a floating-point implementation of the inverse DCT
     17 ; (Discrete Cosine Transform). The following code is based directly on
     18 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
     19 ;
     20 ; [TAB8]
     21 
     22 %include "jsimdext.inc"
     23 %include "jdct.inc"
     24 
     25 ; --------------------------------------------------------------------------
     26 
     27 %macro  unpcklps2 2     ; %1=(a0 a1 a2 a3) / %2=(b0 b1 b2 b3) => %1=(a0 a1 b0 b1)
     28         shufps  %1,%2,(0<<0)|(1<<2)|(0<<4)|(1<<6)       ; lanes 0,1 of %1 then lanes 0,1 of %2
     29 %endmacro
     30 
     31 %macro  unpckhps2 2     ; %1=(a0 a1 a2 a3) / %2=(b0 b1 b2 b3) => %1=(a2 a3 b2 b3)
     32         shufps  %1,%2,(2<<0)|(3<<2)|(2<<4)|(3<<6)       ; lanes 2,3 of %1 then lanes 2,3 of %2
     33 %endmacro
     34 
     35 ; --------------------------------------------------------------------------
     36         SECTION SEG_CONST
     37 
        ; Constant table for jsimd_idct_float_sse2.  Each PD_* entry is one
        ; single-precision value repeated four times so that it can be used
        ; directly as a packed (4 x float) memory operand.  The table start
        ; is exported as a global symbol.
     38         alignz  16
     39         global  EXTN(jconst_idct_float_sse2)
     40 
     41 EXTN(jconst_idct_float_sse2):
     42 
     43 PD_1_414        times 4 dd  1.414213562373095048801689  ; sqrt(2)
     44 PD_1_847        times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8)
     45 PD_1_082        times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8))
     46 PD_M2_613       times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8)+cos(3*pi/8))
        ; 100663296.0 = 0x06000000 = 1.5 * 2^26.  Single-precision values of
        ; this magnitude are spaced 8 apart, so adding this constant to a
        ; much smaller value rounds that value to a multiple of 8 and leaves
        ; (value / 8), rounded, in the low bits of the float's bit pattern.
     47 PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
        ; Sixteen copies of CENTERJSAMPLE (defined elsewhere), added bytewise
        ; to the packed results in pass 2.
     48 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
     49 
     50         alignz  16
     51 
     52 ; --------------------------------------------------------------------------
     53         SECTION SEG_TEXT
     54         BITS    32
     55 ;
     56 ; Perform dequantization and inverse DCT on one block of coefficients.
     57 ;
     58 ; GLOBAL(void)
     59 ; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
     60 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
     61 ;
     62 
        ; Argument accessors.  (b) is the frame base captured right after
        ; "push ebp" in the function prologue, so (b)+0 holds the saved ebp,
        ; (b)+4 the return address, and the four 4-byte arguments follow
        ; from (b)+8.
     63 %define dct_table(b)    (b)+8           ; void *dct_table
     64 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
     65 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
     66 %define output_col(b)   (b)+20          ; JDIMENSION output_col
     67 
        ; Layout of the aligned frame, relative to the aligned ebp:
        ;   [original_ebp]         slot where the prologue saves the unaligned
        ;                          frame base
        ;   wk(0) .. wk(WK_NUM-1)  xmmword scratch slots directly below ebp
        ;   workspace              DCTSIZE2 FAST_FLOATs directly below wk(0)
        ; wk() names WK_NUM before its %define; this works because NASM
        ; expands single-line macros where they are used, not where they are
        ; defined.
     68 %define original_ebp    ebp+0
     69 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
     70 %define WK_NUM          2
     71 %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
     72                                         ; FAST_FLOAT workspace[DCTSIZE2]
     73 
     74         align   16
     75         global  EXTN(jsimd_idct_float_sse2)
     76 
        ; ------------------------------------------------------------------
        ; jsimd_idct_float_sse2
        ;
        ; Dequantizes one block of coefficients, applies the floating-point
        ; inverse DCT and stores the resulting samples through output_buf.
        ; Arguments are read from the caller's stack with the dct_table() /
        ; coef_block() / output_buf() / output_col() accessors defined above.
        ;
        ; Phases:
        ;   prologue - builds a 16-byte-aligned frame holding wk[] and
        ;              workspace[]
        ;   pass 1   - DCTSIZE/4 iterations; each takes four coefficients
        ;              from each of the eight input rows, dequantizes them,
        ;              runs the 1-D IDCT and stores the result transposed
        ;              into workspace
        ;   pass 2   - DCTSIZE/4 iterations; each runs the 1-D IDCT on
        ;              workspace data, rounds and descales by 8, packs to
        ;              bytes, adds PB_CENTERJSAMP and writes four output rows
        ;
        ; Registers: ebp = aligned frame base, ebx = GOT address from get_GOT.
        ; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and
        ; xmm0-xmm7 are overwritten.
        ; ------------------------------------------------------------------
     77 EXTN(jsimd_idct_float_sse2):
     78         push    ebp
     79         mov     eax,esp                         ; eax = frame base before alignment ("original ebp")
     80         sub     esp, byte 4                     ; keep the slot at [esp] clear of the saved ebp
     81         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     82         mov     [esp],eax                       ; [original_ebp] = unaligned frame base
     83         mov     ebp,esp                         ; ebp = aligned ebp
     84         lea     esp, [workspace]                ; reserve wk[] and workspace[] below ebp
     85         push    ebx
     86 ;       push    ecx             ; need not be preserved
     87 ;       push    edx             ; need not be preserved
     88         push    esi
     89         push    edi
     90 
     91         get_GOT ebx             ; get GOT address
     92 
     93         ; ---- Pass 1: process columns from input, store into work array.
     94 
        ; eax is expected to still hold the unaligned frame base from the
        ; prologue, which is why the reload below is commented out.  This
        ; relies on get_GOT (defined elsewhere) leaving eax untouched.
     95 ;       mov     eax, [original_ebp]
     96         mov     edx, POINTER [dct_table(eax)]           ; quantptr
     97         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
     98         lea     edi, [workspace]                        ; FAST_FLOAT *wsptr
     99         mov     ecx, DCTSIZE/4                          ; ctr
    100         alignx  16,7
    101 .columnloop:
    102 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Shortcut for columns whose AC terms (rows 1-7) are all zero.
        ; DWBLOCK/MMBLOCK/XMMBLOCK are addressing macros defined elsewhere;
        ; their first argument is taken to be the row index, as the lane
        ; comments below suggest.
        ; Cheap pre-check first: OR one dword from row 1 and row 2 so that
        ; columns with non-zero data there skip the full test.
    103         mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    104         or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    105         jnz     near .columnDCT
    106 
        ; Full test: OR together the four coefficients of rows 1..7.
    107         movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    108         movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    109         movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    110         movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    111         movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    112         movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    113         movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    114         por     xmm1,xmm2
    115         por     xmm3,xmm4
    116         por     xmm5,xmm6
    117         por     xmm1,xmm3
    118         por     xmm5,xmm7
    119         por     xmm1,xmm5
        ; Signed saturation maps every non-zero word to a non-zero byte, so
        ; after packing, the low dword is zero only if all four OR-ed words
        ; are zero.
    120         packsswb xmm1,xmm1
    121         movd    eax,xmm1
    122         test    eax,eax
    123         jnz     short .columnDCT
    124 
    125         ; -- AC terms all zero
    126 
        ; Only row 0 (DC) contributes.  Sign-extend its four words to dwords
        ; (punpcklwd duplicates each word into both halves of a dword, psrad
        ; shifts the upper copy back down arithmetically), convert to float
        ; and dequantize.
    127         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    128 
    129         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
    130         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
    131         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
    132 
    133         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    134 
    135         movaps  xmm1,xmm0
    136         movaps  xmm2,xmm0
    137         movaps  xmm3,xmm0
    138 
        ; Broadcast each column's dequantized DC value across a register.
    139         shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
    140         shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
    141         shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
    142         shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
    143 
        ; Each broadcast value fills both xmmword halves of its workspace row.
    144         movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    145         movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    146         movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    147         movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
    148         movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    149         movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    150         movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    151         movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    152         jmp     near .nextcolumn
    153         alignx  16,7
    154 %endif
    155 .columnDCT:
    156 
    157         ; -- Even part
    158 
        ; Load rows 0/2/4/6, sign-extend words to dwords, convert to float
        ; and multiply by the matching rows of the quantization table (edx).
    159         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    160         movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    161         movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    162         movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    163 
    164         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
    165         punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
    166         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
    167         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
    168         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
    169         cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
    170 
    171         punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
    172         punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
    173         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
    174         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
    175         cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
    176         cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
    177 
    178         mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    179         mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    180         mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    181         mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    182 
    183         movaps  xmm4,xmm0
    184         movaps  xmm5,xmm1
    185         subps   xmm0,xmm2               ; xmm0=tmp11
    186         subps   xmm1,xmm3
    187         addps   xmm4,xmm2               ; xmm4=tmp10
    188         addps   xmm5,xmm3               ; xmm5=tmp13
    189 
    190         mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
    191         subps   xmm1,xmm5               ; xmm1=tmp12
    192 
    193         movaps  xmm6,xmm4
    194         movaps  xmm7,xmm0
    195         subps   xmm4,xmm5               ; xmm4=tmp3
    196         subps   xmm0,xmm1               ; xmm0=tmp2
    197         addps   xmm6,xmm5               ; xmm6=tmp0
    198         addps   xmm7,xmm1               ; xmm7=tmp1
    199 
        ; Spill tmp3/tmp2 to the wk[] slots; xmm6/xmm7 keep tmp0/tmp1 while
        ; the odd part reuses xmm0-xmm5.
    200         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
    201         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
    202 
    203         ; -- Odd part
    204 
    205         movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    206         movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    207         movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    208         movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    209 
    210         punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
    211         punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
    212         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
    213         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
    214         cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
    215         cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
    216 
    217         punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
    218         punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
    219         psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
    220         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
    221         cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
    222         cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
    223 
    224         mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    225         mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    226         mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    227         mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    228 
    229         movaps  xmm4,xmm2
    230         movaps  xmm0,xmm5
    231         addps   xmm2,xmm1               ; xmm2=z11
    232         addps   xmm5,xmm3               ; xmm5=z13
    233         subps   xmm4,xmm1               ; xmm4=z12
    234         subps   xmm0,xmm3               ; xmm0=z10
    235 
    236         movaps  xmm1,xmm2
    237         subps   xmm2,xmm5
    238         addps   xmm1,xmm5               ; xmm1=tmp7
    239 
    240         mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
    241 
    242         movaps  xmm3,xmm0
    243         addps   xmm0,xmm4
    244         mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
    245         mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
    246         mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
    247         addps   xmm3,xmm0               ; xmm3=tmp12
    248         subps   xmm4,xmm0               ; xmm4=tmp10
    249 
    250         ; -- Final output stage
    251 
    252         subps   xmm3,xmm1               ; xmm3=tmp6
    253         movaps  xmm5,xmm6
    254         movaps  xmm0,xmm7
    255         addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
    256         addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
    257         subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
    258         subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
    259         subps   xmm2,xmm3               ; xmm2=tmp5
    260 
    261         movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
    262         unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
    263         unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
    264         movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
    265         unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
    266         unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
    267 
        ; Swap through wk[]: reload the spilled tmp2/tmp3 and park the
        ; half-transposed rows 6/7 in the same slots.
    268         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    269         movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
    270 
    271         movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    272         movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
    273 
    274         addps   xmm4,xmm2               ; xmm4=tmp4
    275         movaps  xmm0,xmm7
    276         movaps  xmm3,xmm5
    277         addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
    278         addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
    279         subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
    280         subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
    281 
    282         movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
    283         unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
    284         unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
    285         movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
    286         unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
    287         unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
    288 
    289         movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
    290         unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
    291         unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
    292         movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
    293         unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
    294         unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
    295 
    296         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    297         movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
    298 
        ; First xmmword of workspace rows 0..3 (results for input rows 0-3
        ; of each of the four columns).
    299         movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
    300         movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    301         movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
    302         movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    303 
    304         movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
    305         unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
    306         unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
    307         movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
    308         unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
    309         unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
    310 
        ; Second xmmword of workspace rows 0..3 (results for input rows 4-7).
    311         movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
    312         movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    313         movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
    314         movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
    315 
    316 .nextcolumn:
    317         add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
    318         add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
    319         add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
    320         dec     ecx                                     ; ctr
    321         jnz     near .columnloop
    322 
    323         ; -- Prefetch the next coefficient block
    324 
        ; The loop above advanced esi by DCTSIZE/4 * 4 coefficients, so with
        ; DCTSIZE equal to 8 the address esi + (DCTSIZE2-8)*SIZEOF_JCOEF lies
        ; just past the current block.  Four hints, 32 bytes apart.
    325         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    326         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    327         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    328         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    329 
    330         ; ---- Pass 2: process rows from work array, store into output array.
    331 
        ; eax first serves as the frame base for reading the arguments and is
        ; then overwritten with output_col, used as the column offset for the
        ; stores at the end of each iteration.
    332         mov     eax, [original_ebp]
    333         lea     esi, [workspace]                        ; FAST_FLOAT *wsptr
    334         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
    335         mov     eax, JDIMENSION [output_col(eax)]
    336         mov     ecx, DCTSIZE/4                          ; ctr
    337         alignx  16,7
    338 .rowloop:
    339 
    340         ; -- Even part
    341 
    342         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    343         movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    344         movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    345         movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
    346 
    347         movaps  xmm4,xmm0
    348         movaps  xmm5,xmm1
    349         subps   xmm0,xmm2               ; xmm0=tmp11
    350         subps   xmm1,xmm3
    351         addps   xmm4,xmm2               ; xmm4=tmp10
    352         addps   xmm5,xmm3               ; xmm5=tmp13
    353 
    354         mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
    355         subps   xmm1,xmm5               ; xmm1=tmp12
    356 
    357         movaps  xmm6,xmm4
    358         movaps  xmm7,xmm0
    359         subps   xmm4,xmm5               ; xmm4=tmp3
    360         subps   xmm0,xmm1               ; xmm0=tmp2
    361         addps   xmm6,xmm5               ; xmm6=tmp0
    362         addps   xmm7,xmm1               ; xmm7=tmp1
    363 
    364         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
    365         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
    366 
    367         ; -- Odd part
    368 
    369         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    370         movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    371         movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    372         movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
    373 
    374         movaps  xmm4,xmm2
    375         movaps  xmm0,xmm5
    376         addps   xmm2,xmm1               ; xmm2=z11
    377         addps   xmm5,xmm3               ; xmm5=z13
    378         subps   xmm4,xmm1               ; xmm4=z12
    379         subps   xmm0,xmm3               ; xmm0=z10
    380 
    381         movaps  xmm1,xmm2
    382         subps   xmm2,xmm5
    383         addps   xmm1,xmm5               ; xmm1=tmp7
    384 
    385         mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
    386 
    387         movaps  xmm3,xmm0
    388         addps   xmm0,xmm4
    389         mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
    390         mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
    391         mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
    392         addps   xmm3,xmm0               ; xmm3=tmp12
    393         subps   xmm4,xmm0               ; xmm4=tmp10
    394 
    395         ; -- Final output stage
    396 
    397         subps   xmm3,xmm1               ; xmm3=tmp6
    398         movaps  xmm5,xmm6
    399         movaps  xmm0,xmm7
    400         addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
    401         addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
    402         subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
    403         subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
    404         subps   xmm2,xmm3               ; xmm2=tmp5
    405 
        ; Round and descale without a float->int conversion: after adding
        ; PD_RNDINT_MAGIC, the low 16 bits of each dword's bit pattern hold
        ; the rounded value of data/8 ("**" below marks the unwanted upper
        ; word).  xmm3 is built as a mask that keeps only that low word.
    406         movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
    407         pcmpeqd xmm3,xmm3
    408         psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    409 
    410         addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    411         addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    412         addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    413         addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
    414 
        ; Merge two columns per register: the even column keeps its low word
        ; (pand), the odd column's low word is shifted into the high word
        ; (pslld), and por combines them into 8 words.
    415         pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
    416         pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    417         pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
    418         pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    419         por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
    420         por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
    421 
    422         movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    423         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
    424 
    425         addps   xmm4,xmm2               ; xmm4=tmp4
    426         movaps  xmm7,xmm1
    427         movaps  xmm5,xmm3
    428         addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
    429         addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
    430         subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
    431         subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
    432 
        ; Same rounding and merging for columns 2..5; the magic constant and
        ; the word mask are rebuilt because their registers were reused.
    433         movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
    434         pcmpeqd xmm4,xmm4
    435         psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    436 
    437         addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    438         addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    439         addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    440         addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
    441 
    442         pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
    443         pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    444         pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
    445         pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    446         por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
    447         por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
    448 
    449         movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
    450 
        ; Pack words to bytes with signed saturation, then add the centering
        ; constant bytewise (paddb wraps modulo 256).
    451         packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    452         packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    453         paddb     xmm6,xmm2
    454         paddb     xmm1,xmm2
    455 
    456         movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
    457         punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    458         punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    459 
    460         movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
    461         punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    462         punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    463 
        ; 0x4E swaps the two qword halves, bringing rows 1 and 3 into the low
        ; qword so that movq can store them.
    464         pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    465         pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    466 
        ; ebx is borrowed as a second row pointer for the stores below, so
        ; the GOT address is saved and restored around them (pushpic/poppic
        ; are macros defined elsewhere).
    467         pushpic ebx                     ; save GOT address
    468 
        ; Each movq writes the low 8 bytes of a register: one output row of
        ; eight samples at column offset eax.  Rows 0 and 2 first, then 1
        ; and 3.
    469         mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    470         mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    471         movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    472         movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
    473         mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    474         mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    475         movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
    476         movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
    477 
    478         poppic  ebx                     ; restore GOT address
    479 
        ; Advance to the next four workspace elements and the next four
        ; output row pointers.
    480         add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
    481         add     edi, byte 4*SIZEOF_JSAMPROW
    482         dec     ecx                             ; ctr
    483         jnz     near .rowloop
    484 
        ; Epilogue: restore the saved registers, then unwind the aligned
        ; frame.  [ebp] holds the unaligned frame base stored by the
        ; prologue; popping it into esp leaves esp pointing at the saved ebp.
    485         pop     edi
    486         pop     esi
    487 ;       pop     edx             ; need not be preserved
    488 ;       pop     ecx             ; need not be preserved
    489         pop     ebx
    490         mov     esp,ebp         ; esp <- aligned ebp
    491         pop     esp             ; esp <- original ebp
    492         pop     ebp
    493         ret
    494 
    495 ; For some reason, the OS X linker does not honor the request to align the
    496 ; segment unless we do this.
    497         align   16
    498