Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; This file contains a floating-point implementation of the inverse DCT
     18 ; (Discrete Cosine Transform). The following code is based directly on
     19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
     28 %macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5); %2 is left unchanged
     29         shufps  %1,%2,0x44      ; 0x44: lanes 0,1 taken from %1, then lanes 0,1 taken from %2
     30 %endmacro
     31 
     32 %macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7); %2 is left unchanged
     33         shufps  %1,%2,0xEE      ; 0xEE: lanes 2,3 taken from %1, then lanes 2,3 taken from %2
     34 %endmacro
     35 
     36 ; --------------------------------------------------------------------------
     37         SECTION SEG_CONST
     38 ; Constant table for jsimd_idct_float_sse2; read via [rel ...] memory operands.
     39         alignz  16
     40         global  EXTN(jconst_idct_float_sse2)
     41 
     42 EXTN(jconst_idct_float_sse2):
     43 ; Each PD_* entry is one single-precision value replicated into all four lanes.
     44 PD_1_414        times 4 dd  1.414213562373095048801689  ; sqrt(2)
     45 PD_1_847        times 4 dd  1.847759065022573512256366  ; 2*cos(pi/8); scales z5
     46 PD_1_082        times 4 dd  1.082392200292393968799446  ; 2*(cos(pi/8)-cos(3*pi/8)); scales z12
     47 PD_M2_613       times 4 dd -2.613125929752753055713286  ; -2*(cos(pi/8)+cos(3*pi/8)); scales z10
     48 PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
     49 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE       ; added bytewise to the packed samples in pass 2
     50 ; PD_RNDINT_MAGIC is added in pass 2 so the low word of each dword holds roundint(x/8).
     51         alignz  16
     52 
     53 ; --------------------------------------------------------------------------
     54         SECTION SEG_TEXT
     55         BITS    64
     56 ;
     57 ; Perform dequantization and inverse DCT on one block of coefficients.
     58 ; Pass 1 handles columns (-> transposed float workspace), pass 2 handles rows.
     59 ; GLOBAL(void)
     60 ; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
     61 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
     62 ; Writes DCTSIZE rows of output_buf, each starting at sample offset output_col.
     63 ; Arguments, as placed in registers by collect_args (macro not defined in this file):
     64 ; r10 = void *dct_table
     65 ; r11 = JCOEFPTR coef_block
     66 ; r12 = JSAMPARRAY output_buf
     67 ; r13 = JDIMENSION output_col
     68 ; Stack frame (rbp = 16-byte-aligned frame base set up in the prologue):
     69 %define original_rbp    rbp+0           ; slot holding rsp as it was right after "push rbp"
     70 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
     71 %define WK_NUM          2
     72 %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
     73                                         ; FAST_FLOAT workspace[DCTSIZE2]
     74 
     75         align   16
     76         global  EXTN(jsimd_idct_float_sse2)
     77 
     78 EXTN(jsimd_idct_float_sse2):
     79         push    rbp
     80         mov     rax,rsp                         ; rax = rsp after the push (points at saved rbp)
     81         sub     rsp, byte 4
     82         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     83         mov     [rsp],rax                       ; remember that rsp; the epilogue pops it back
     84         mov     rbp,rsp                         ; rbp = aligned rbp
     85         lea     rsp, [workspace]                ; reserve wk[] and workspace[] below rbp
     86         collect_args
     87         push    rbx                             ; rbx is used as a row pointer in pass 2
     88 
     89         ; ---- Pass 1: process columns from input, store into work array.
     90 
     91         mov     rdx, r10                ; quantptr
     92         mov     rsi, r11                ; inptr
     93         lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
     94         mov     rcx, DCTSIZE/4                          ; ctr (four columns per iteration)
     95 .columnloop:
     96 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
     97         mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]      ; cheap pre-check on two dwords
     98         or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]      ; before the full AC scan below
     99         jnz     near .columnDCT
    100 
    101         movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    102         movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    103         movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    104         movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    105         movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    106         movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    107         movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    108         por     xmm1,xmm2
    109         por     xmm3,xmm4
    110         por     xmm5,xmm6
    111         por     xmm1,xmm3
    112         por     xmm5,xmm7
    113         por     xmm1,xmm5               ; xmm1 = OR of rows 1..7 for these four columns
    114         packsswb xmm1,xmm1              ; narrow words to bytes so one dword covers all four
    115         movd    eax,xmm1                ; (the 32-bit write also clears the upper half of rax)
    116         test    rax,rax
    117         jnz     short .columnDCT
    118 
    119         ; -- AC terms all zero
    120 
    121         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    122 
    123         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
    124         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
    125         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
    126 
    127         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]        ; dequantize DC
    128 
    129         movaps  xmm1,xmm0
    130         movaps  xmm2,xmm0
    131         movaps  xmm3,xmm0
    132 
    133         shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
    134         shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
    135         shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
    136         shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
    137         ; Broadcast: each column's dequantized DC value fills both halves of its block.
    138         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    139         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    140         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    141         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    142         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    143         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    144         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    145         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    146         jmp     near .nextcolumn
    147 %endif
    148 .columnDCT:
    149 
    150         ; -- Even part
    151 
    152         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    153         movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    154         movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    155         movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    156 
    157         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
    158         punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
    159         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
    160         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
    161         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
    162         cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
    163 
    164         punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
    165         punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
    166         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
    167         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
    168         cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
    169         cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
    170         ; Dequantize: multiply each coefficient row by the matching multiplier row.
    171         mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    172         mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    173         mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    174         mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    175 
    176         movaps  xmm4,xmm0
    177         movaps  xmm5,xmm1
    178         subps   xmm0,xmm2               ; xmm0=tmp11
    179         subps   xmm1,xmm3
    180         addps   xmm4,xmm2               ; xmm4=tmp10
    181         addps   xmm5,xmm3               ; xmm5=tmp13
    182 
    183         mulps   xmm1,[rel PD_1_414]
    184         subps   xmm1,xmm5               ; xmm1=tmp12
    185 
    186         movaps  xmm6,xmm4
    187         movaps  xmm7,xmm0
    188         subps   xmm4,xmm5               ; xmm4=tmp3
    189         subps   xmm0,xmm1               ; xmm0=tmp2
    190         addps   xmm6,xmm5               ; xmm6=tmp0
    191         addps   xmm7,xmm1               ; xmm7=tmp1
    192         ; Spill tmp2/tmp3: the odd part below needs all eight xmm registers.
    193         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
    194         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
    195 
    196         ; -- Odd part
    197 
    198         movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    199         movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    200         movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    201         movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    202 
    203         punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
    204         punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
    205         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
    206         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
    207         cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
    208         cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
    209 
    210         punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
    211         punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
    212         psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
    213         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
    214         cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
    215         cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
    216 
    217         mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    218         mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    219         mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    220         mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    221 
    222         movaps  xmm4,xmm2
    223         movaps  xmm0,xmm5
    224         addps   xmm2,xmm1               ; xmm2=z11
    225         addps   xmm5,xmm3               ; xmm5=z13
    226         subps   xmm4,xmm1               ; xmm4=z12
    227         subps   xmm0,xmm3               ; xmm0=z10
    228 
    229         movaps  xmm1,xmm2
    230         subps   xmm2,xmm5
    231         addps   xmm1,xmm5               ; xmm1=tmp7
    232 
    233         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
    234 
    235         movaps  xmm3,xmm0
    236         addps   xmm0,xmm4
    237         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
    238         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
    239         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
    240         addps   xmm3,xmm0               ; xmm3=tmp12
    241         subps   xmm4,xmm0               ; xmm4=tmp10
    242 
    243         ; -- Final output stage
    244 
    245         subps   xmm3,xmm1               ; xmm3=tmp6
    246         movaps  xmm5,xmm6
    247         movaps  xmm0,xmm7
    248         addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
    249         addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
    250         subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
    251         subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
    252         subps   xmm2,xmm3               ; xmm2=tmp5
    253 
    254         movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
    255         unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
    256         unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
    257         movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
    258         unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
    259         unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
    260 
    261         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    262         movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
    263         ; Reuse the wk slots for the half-transposed data6/data7 pairs.
    264         movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    265         movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
    266 
    267         addps   xmm4,xmm2               ; xmm4=tmp4
    268         movaps  xmm0,xmm7
    269         movaps  xmm3,xmm5
    270         addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
    271         addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
    272         subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
    273         subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
    274 
    275         movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
    276         unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
    277         unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
    278         movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
    279         unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
    280         unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
    281 
    282         movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
    283         unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
    284         unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
    285         movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
    286         unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
    287         unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
    288 
    289         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    290         movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
    291 
    292         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    293         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    294         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    295         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    296 
    297         movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
    298         unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
    299         unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
    300         movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
    301         unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
    302         unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
    303 
    304         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    305         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    306         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    307         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    308 
    309 .nextcolumn:
    310         add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
    311         add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
    312         add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
    313         dec     rcx                                     ; ctr
    314         jnz     near .columnloop
    315 
    316         ; -- Prefetch the next coefficient block
    317 
    318         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    319         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    320         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    321         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    322 
    323         ; ---- Pass 2: process rows from work array, store into output array.
    324 
    325         ; Loop registers: rsi=wsptr, rdi=output row pointers, rax=output_col, rcx=ctr.
    326         lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
    327         mov     rdi, r12        ; (JSAMPROW *)
    328         mov     eax, r13d       ; output_col (the 32-bit move zero-extends into rax)
    329         mov     rcx, DCTSIZE/4                          ; ctr (four output rows per iteration)
    330 .rowloop:
    331 
    332         ; -- Even part
    333 
    334         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    335         movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    336         movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    337         movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
    338 
    339         movaps  xmm4,xmm0
    340         movaps  xmm5,xmm1
    341         subps   xmm0,xmm2               ; xmm0=tmp11
    342         subps   xmm1,xmm3
    343         addps   xmm4,xmm2               ; xmm4=tmp10
    344         addps   xmm5,xmm3               ; xmm5=tmp13
    345 
    346         mulps   xmm1,[rel PD_1_414]
    347         subps   xmm1,xmm5               ; xmm1=tmp12
    348 
    349         movaps  xmm6,xmm4
    350         movaps  xmm7,xmm0
    351         subps   xmm4,xmm5               ; xmm4=tmp3
    352         subps   xmm0,xmm1               ; xmm0=tmp2
    353         addps   xmm6,xmm5               ; xmm6=tmp0
    354         addps   xmm7,xmm1               ; xmm7=tmp1
    355 
    356         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
    357         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
    358 
    359         ; -- Odd part
    360 
    361         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    362         movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    363         movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    364         movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
    365 
    366         movaps  xmm4,xmm2
    367         movaps  xmm0,xmm5
    368         addps   xmm2,xmm1               ; xmm2=z11
    369         addps   xmm5,xmm3               ; xmm5=z13
    370         subps   xmm4,xmm1               ; xmm4=z12
    371         subps   xmm0,xmm3               ; xmm0=z10
    372 
    373         movaps  xmm1,xmm2
    374         subps   xmm2,xmm5
    375         addps   xmm1,xmm5               ; xmm1=tmp7
    376 
    377         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
    378 
    379         movaps  xmm3,xmm0
    380         addps   xmm0,xmm4
    381         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
    382         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
    383         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
    384         addps   xmm3,xmm0               ; xmm3=tmp12
    385         subps   xmm4,xmm0               ; xmm4=tmp10
    386 
    387         ; -- Final output stage
    388 
    389         subps   xmm3,xmm1               ; xmm3=tmp6
    390         movaps  xmm5,xmm6
    391         movaps  xmm0,xmm7
    392         addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
    393         addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
    394         subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
    395         subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
    396         subps   xmm2,xmm3               ; xmm2=tmp5
    397 
    398         movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
    399         pcmpeqd xmm3,xmm3               ; all ones
    400         psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    401         ; Adding the magic constant leaves the rounded result in each dword's low word.
    402         addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    403         addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    404         addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    405         addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
    406 
    407         pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
    408         pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    409         pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
    410         pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    411         por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
    412         por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
    413 
    414         movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    415         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
    416 
    417         addps   xmm4,xmm2               ; xmm4=tmp4
    418         movaps  xmm7,xmm1
    419         movaps  xmm5,xmm3
    420         addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
    421         addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
    422         subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
    423         subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
    424 
    425         movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
    426         pcmpeqd xmm4,xmm4               ; all ones
    427         psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    428 
    429         addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    430         addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    431         addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    432         addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
    433 
    434         pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
    435         pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    436         pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
    437         pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    438         por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
    439         por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
    440 
    441         movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
    442         ; Saturate to signed bytes, then add the centering bias to every byte.
    443         packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    444         packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    445         paddb     xmm6,xmm2
    446         paddb     xmm1,xmm2
    447 
    448         movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
    449         punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    450         punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    451 
    452         movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
    453         punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    454         punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    455 
    456         pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    457         pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    458         ; Store the low 8 bytes of each register to output rows 0, 2, 1, 3 of this group.
    459         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    460         mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    461         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    462         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
    463         mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    464         mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    465         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    466         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
    467 
    468         add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
    469         add     rdi, byte 4*SIZEOF_JSAMPROW     ; next group of four output row pointers
    470         dec     rcx                             ; ctr
    471         jnz     near .rowloop
    472 
    473         pop     rbx
    474         uncollect_args
    475         mov     rsp,rbp         ; rsp <- aligned rbp
    476         pop     rsp             ; rsp <- value saved in the prologue (points at saved rbp)
    477         pop     rbp             ; restore the caller's rbp
    478         ret
    479 
    480 ; For some reason, the OS X linker does not honor the request to align the
    481 ; segment unless we do this.
    482         align   16
    483