Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; This file contains a floating-point implementation of the inverse DCT
     19 ; (Discrete Cosine Transform). The following code is based directly on
     20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
     21 ;
     22 ; [TAB8]
     23 
     24 %include "jsimdext.inc"
     25 %include "jdct.inc"
     26 
     27 ; --------------------------------------------------------------------------
     28 
     29 %macro  unpcklps2 2
        ; Join the low 64-bit halves of both operands:
        ;   %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
        ; The immediate is spelled out as four 2-bit lane selectors
        ; (result lanes 0,1 pick from %1; lanes 2,3 pick from %2).
     30         shufps  %1, %2, (0<<0)|(1<<2)|(0<<4)|(1<<6)     ; = 0x44
     31 %endmacro
     32 
     33 %macro  unpckhps2 2
        ; Join the high 64-bit halves of both operands:
        ;   %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     34         shufps  %1, %2, (2<<0)|(3<<2)|(2<<4)|(3<<6)     ; = 0xEE
     35 %endmacro
     36 
     37 ; --------------------------------------------------------------------------
     38         SECTION SEG_CONST
     39 
        ; Read-only constants used by jsimd_idct_float_sse2.  Every entry is one
        ; full 16-byte XMM word (4 identical floats, or 16 identical bytes) so it
        ; can be used directly as a packed memory operand.
     40         alignz  16
     41         global  EXTN(jconst_idct_float_sse2)
     42 
     43 EXTN(jconst_idct_float_sse2):
     44 
        ; Rotation constants of the IDCT butterflies, with c(k) = cos(k*pi/16):
        ;   PD_1_414  =  sqrt(2)
        ;   PD_1_847  =  2*c(2)
        ;   PD_1_082  =  2*(c(2)-c(6))
        ;   PD_M2_613 = -2*(c(2)+c(6))
     45 PD_1_414        times 4 dd  1.414213562373095048801689
     46 PD_1_847        times 4 dd  1.847759065022573512256366
     47 PD_1_082        times 4 dd  1.082392200292393968799446
     48 PD_M2_613       times 4 dd -2.613125929752753055713286
        ; Round-to-integer bias: 100663296.0 = 1.5 * 2^26.  Floats of that
        ; magnitude have a unit-in-last-place of 8, so after adding the bias to x
        ; the low 16 bits of the float's bit pattern hold roundint(x/8) as a
        ; two's-complement word.  Pass 2 extracts those 16 bits with pand/pslld.
     49 PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
        ; Added byte-wise (paddb) to the saturated signed results in pass 2.
     50 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
     51 
     52         alignz  16
     53 
     54 ; --------------------------------------------------------------------------
     55         SECTION SEG_TEXT
     56         BITS    64
     57 ;
     58 ; Perform dequantization and inverse DCT on one block of coefficients.
     59 ;
     60 ; GLOBAL(void)
     61 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
     62 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
     63 ;
        ; Pass 1 multiplies the coefficients by the dct_table entries and runs the
        ; 1-D IDCT down the columns, four columns per loop iteration, storing the
        ; transposed result in an on-stack float workspace.  Pass 2 runs the same
        ; 1-D IDCT over the workspace, four rows per iteration, converts each
        ; value to roundint(x/8), saturates it to a signed byte, adds
        ; CENTERJSAMPLE and stores 8 samples per row at output_buf[row]+output_col.
        ;
        ; Uses rax, rbx (saved/restored here), rcx, rdx, rsi, rdi and xmm0-xmm7.
        ; NOTE(review): collect_args/uncollect_args are defined outside this file;
        ; they are assumed to move the ABI arguments into r10-r13 and to preserve
        ; whatever callee-saved registers the target ABI requires -- confirm.
        ;
     64 
     65 ; r10 = void * dct_table
     66 ; r11 = JCOEFPTR coef_block
     67 ; r12 = JSAMPARRAY output_buf
     68 ; r13 = JDIMENSION output_col
     69 
        ; Stack frame, relative to the 16-byte-aligned rbp set up in the prologue:
        ;   [rbp+0]       rsp as it was right after "push rbp" (see prologue)
        ;   wk(0),wk(1)   two XMM-sized scratch slots immediately below rbp
        ;   workspace     DCTSIZE2 FAST_FLOATs immediately below wk(0)
     70 %define original_rbp    rbp+0
     71 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
     72 %define WK_NUM          2
     73 %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
     74                                         ; FAST_FLOAT workspace[DCTSIZE2]
     75 
     76         align   16
     77         global  EXTN(jsimd_idct_float_sse2)
     78 
     79 EXTN(jsimd_idct_float_sse2):
     80         push    rbp
     81         mov     rax,rsp                         ; rax = rsp after the push (-> saved rbp)
        ; NOTE(review): only 4 bytes are reserved here although 8 bytes are stored
        ; at [rsp] below.  This is safe as long as rsp is at least 8-byte aligned
        ; on entry, because the AND then rounds down by 8 or more -- confirm.
     82         sub     rsp, byte 4
     83         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     84         mov     [rsp],rax                       ; [original_rbp] = pre-alignment rsp
     85         mov     rbp,rsp                         ; rbp = aligned rbp
     86         lea     rsp, [workspace]                ; reserve wk[] and workspace[]
     87         collect_args
     88         push    rbx                             ; rbx is a row-pointer scratch in pass 2
     89 
     90         ; ---- Pass 1: process columns from input, store into work array.
     91 
     92         mov     rdx, r10                ; quantptr
     93         mov     rsi, r11                ; inptr
     94         lea     rdi, [workspace]                        ; FAST_FLOAT * wsptr
     95         mov     rcx, DCTSIZE/4                          ; ctr
     96 .columnloop:
     97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Shortcut: if every AC term of these four columns is zero, the IDCT
        ; output of each column is just its dequantized DC term repeated.
        ; First a cheap scalar test on one dword from each of rows 1 and 2 ...
     98         mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
     99         or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    100         jnz     near .columnDCT
    101 
        ; ... then OR rows 1..7 of the four columns together.
    102         movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    103         movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    104         movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    105         movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    106         movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    107         movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    108         movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    109         por     xmm1,xmm2
    110         por     xmm3,xmm4
    111         por     xmm5,xmm6
    112         por     xmm1,xmm3
    113         por     xmm5,xmm7
    114         por     xmm1,xmm5
        ; Signed saturation keeps any non-zero word non-zero, so the four packed
        ; bytes in eax are all zero only if all four ORed words were zero.
    115         packsswb xmm1,xmm1
    116         movd    eax,xmm1                ; 32-bit write also clears the upper half of rax
    117         test    rax,rax
    118         jnz     short .columnDCT
    119 
    120         ; -- AC terms all zero
    121 
    122         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    123 
        ; Duplicate each word, then arithmetic-shift right: sign-extends the four
        ; 16-bit coefficients to 32 bits before the int->float conversion.
    124         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
    125         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
    126         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
    127 
    128         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    129 
    130         movaps  xmm1,xmm0
    131         movaps  xmm2,xmm0
    132         movaps  xmm3,xmm0
    133 
        ; Broadcast each column's DC value across a whole workspace row.
    134         shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
    135         shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
    136         shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
    137         shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
    138 
    139         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    140         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
    141         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    142         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
    143         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
    144         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
    145         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    146         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    147         jmp     near .nextcolumn
    148 %endif
    149 .columnDCT:
    150 
    151         ; -- Even part
    152 
        ; Load rows 0,2,4,6 of the four columns, widen to float and dequantize.
    153         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    154         movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    155         movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    156         movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    157 
    158         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
    159         punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
    160         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
    161         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
    162         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
    163         cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
    164 
    165         punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
    166         punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
    167         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
    168         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
    169         cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
    170         cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
    171 
    172         mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    173         mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    174         mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    175         mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    176 
    177         movaps  xmm4,xmm0
    178         movaps  xmm5,xmm1
    179         subps   xmm0,xmm2               ; xmm0=tmp11
    180         subps   xmm1,xmm3
    181         addps   xmm4,xmm2               ; xmm4=tmp10
    182         addps   xmm5,xmm3               ; xmm5=tmp13
    183 
    184         mulps   xmm1,[rel PD_1_414]
    185         subps   xmm1,xmm5               ; xmm1=tmp12
    186 
    187         movaps  xmm6,xmm4
    188         movaps  xmm7,xmm0
    189         subps   xmm4,xmm5               ; xmm4=tmp3
    190         subps   xmm0,xmm1               ; xmm0=tmp2
    191         addps   xmm6,xmm5               ; xmm6=tmp0
    192         addps   xmm7,xmm1               ; xmm7=tmp1
    193 
        ; All eight XMM registers are needed by the odd part: spill tmp2/tmp3.
    194         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
    195         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
    196 
    197         ; -- Odd part
    198 
    199         movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    200         movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    201         movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    202         movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    203 
    204         punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
    205         punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
    206         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
    207         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
    208         cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
    209         cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
    210 
    211         punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
    212         punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
    213         psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
    214         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
    215         cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
    216         cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
    217 
    218         mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    219         mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    220         mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    221         mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
    222 
    223         movaps  xmm4,xmm2
    224         movaps  xmm0,xmm5
    225         addps   xmm2,xmm1               ; xmm2=z11
    226         addps   xmm5,xmm3               ; xmm5=z13
    227         subps   xmm4,xmm1               ; xmm4=z12
    228         subps   xmm0,xmm3               ; xmm0=z10
    229 
    230         movaps  xmm1,xmm2
    231         subps   xmm2,xmm5
    232         addps   xmm1,xmm5               ; xmm1=tmp7
    233 
    234         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
    235 
    236         movaps  xmm3,xmm0
    237         addps   xmm0,xmm4
    238         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
    239         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
    240         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
    241         addps   xmm3,xmm0               ; xmm3=tmp12
    242         subps   xmm4,xmm0               ; xmm4=tmp10
    243 
    244         ; -- Final output stage
    245 
    246         subps   xmm3,xmm1               ; xmm3=tmp6
    247         movaps  xmm5,xmm6
    248         movaps  xmm0,xmm7
    249         addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
    250         addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
    251         subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
    252         subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
    253         subps   xmm2,xmm3               ; xmm2=tmp5
    254 
    255         movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
    256         unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
    257         unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
    258         movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
    259         unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
    260         unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
    261 
        ; Swap the spilled tmp2/tmp3 back in and park the half-transposed
        ; rows 6/7 in the same two scratch slots.
    262         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    263         movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
    264 
    265         movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
    266         movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
    267 
    268         addps   xmm4,xmm2               ; xmm4=tmp4
    269         movaps  xmm0,xmm7
    270         movaps  xmm3,xmm5
    271         addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
    272         addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
    273         subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
    274         subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
    275 
    276         movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
    277         unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
    278         unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
    279         movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
    280         unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
    281         unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
    282 
    283         movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
    284         unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
    285         unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
    286         movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
    287         unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
    288         unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
    289 
    290         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
    291         movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
    292 
        ; Each workspace row written here holds one input column's 8 results.
    293         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
    294         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
    295         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
    296         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
    297 
    298         movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
    299         unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
    300         unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
    301         movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
    302         unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
    303         unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
    304 
    305         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
    306         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
    307         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
    308         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
    309 
    310 .nextcolumn:
        ; Advance by four input columns, i.e. four workspace rows.
    311         add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
    312         add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
    313         add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
    314         dec     rcx                                     ; ctr
    315         jnz     near .columnloop
    316 
    317         ; -- Prefetch the next coefficient block
    318 
        ; rsi has advanced by (DCTSIZE/4)*4 coefficients during pass 1; assuming
        ; DCTSIZE is 8 (defined outside this file), the addresses below start
        ; DCTSIZE2 coefficients past the start of coef_block.
    319         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    320         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    321         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    322         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
    323 
    324         ; ---- Pass 2: process rows from work array, store into output array.
    325 
        ; (A dead "mov rax, [original_rbp]" used to sit here; rax was overwritten
        ; by the output_col load below before ever being read.)
    327         lea     rsi, [workspace]                        ; FAST_FLOAT * wsptr
    328         mov     rdi, r12        ; (JSAMPROW *)
    329         mov     eax, r13d       ; rax = output_col (32-bit write zero-extends)
    330         mov     rcx, DCTSIZE/4                          ; ctr
    331 .rowloop:
    332 
    333         ; -- Even part
    334 
    335         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
    336         movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
    337         movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
    338         movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
    339 
    340         movaps  xmm4,xmm0
    341         movaps  xmm5,xmm1
    342         subps   xmm0,xmm2               ; xmm0=tmp11
    343         subps   xmm1,xmm3
    344         addps   xmm4,xmm2               ; xmm4=tmp10
    345         addps   xmm5,xmm3               ; xmm5=tmp13
    346 
    347         mulps   xmm1,[rel PD_1_414]
    348         subps   xmm1,xmm5               ; xmm1=tmp12
    349 
    350         movaps  xmm6,xmm4
    351         movaps  xmm7,xmm0
    352         subps   xmm4,xmm5               ; xmm4=tmp3
    353         subps   xmm0,xmm1               ; xmm0=tmp2
    354         addps   xmm6,xmm5               ; xmm6=tmp0
    355         addps   xmm7,xmm1               ; xmm7=tmp1
    356 
    357         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
    358         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
    359 
    360         ; -- Odd part
    361 
    362         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
    363         movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
    364         movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
    365         movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
    366 
    367         movaps  xmm4,xmm2
    368         movaps  xmm0,xmm5
    369         addps   xmm2,xmm1               ; xmm2=z11
    370         addps   xmm5,xmm3               ; xmm5=z13
    371         subps   xmm4,xmm1               ; xmm4=z12
    372         subps   xmm0,xmm3               ; xmm0=z10
    373 
    374         movaps  xmm1,xmm2
    375         subps   xmm2,xmm5
    376         addps   xmm1,xmm5               ; xmm1=tmp7
    377 
    378         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
    379 
    380         movaps  xmm3,xmm0
    381         addps   xmm0,xmm4
    382         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
    383         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
    384         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
    385         addps   xmm3,xmm0               ; xmm3=tmp12
    386         subps   xmm4,xmm0               ; xmm4=tmp10
    387 
    388         ; -- Final output stage
    389 
    390         subps   xmm3,xmm1               ; xmm3=tmp6
    391         movaps  xmm5,xmm6
    392         movaps  xmm0,xmm7
    393         addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
    394         addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
    395         subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
    396         subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
    397         subps   xmm2,xmm3               ; xmm2=tmp5
    398 
        ; Float -> int16 without a convert instruction: after adding the magic
        ; bias, the low word of each dword is roundint(data/8).  Even-numbered
        ; outputs keep that low word (pand), odd-numbered outputs are shifted
        ; into the high word (pslld), and por interleaves the pair.
    399         movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
    400         pcmpeqd xmm3,xmm3
    401         psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    402 
    403         addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
    404         addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
    405         addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
    406         addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
    407 
    408         pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
    409         pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
    410         pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
    411         pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
    412         por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
    413         por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
    414 
    415         movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
    416         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
    417 
    418         addps   xmm4,xmm2               ; xmm4=tmp4
    419         movaps  xmm7,xmm1
    420         movaps  xmm5,xmm3
    421         addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
    422         addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
    423         subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
    424         subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
    425 
    426         movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
    427         pcmpeqd xmm4,xmm4
    428         psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
    429 
    430         addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
    431         addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
    432         addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
    433         addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
    434 
    435         pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
    436         pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
    437         pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
    438         pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
    439         por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
    440         por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
    441 
    442         movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
    443 
        ; Saturate the words to signed bytes, then add CENTERJSAMPLE byte-wise.
    444         packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
    445         packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
    446         paddb     xmm6,xmm2
    447         paddb     xmm1,xmm2
    448 
    449         movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
    450         punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    451         punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    452 
    453         movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
    454         punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    455         punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    456 
        ; Swap the 64-bit halves so rows 1 and 3 sit in the low quadword too,
        ; ready for the 8-byte movq stores below.
    457         pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    458         pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    459 
        ; Store 8 samples to each of the 4 output rows at column rax=output_col.
    460         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    461         mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    462         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    463         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
    464         mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    465         mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    466         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    467         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
    468 
    469         add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
    470         add     rdi, byte 4*SIZEOF_JSAMPROW
    471         dec     rcx                             ; ctr
    472         jnz     near .rowloop
    473 
    474         pop     rbx
    475         uncollect_args
    476         mov     rsp,rbp         ; rsp <- aligned rbp
    477         pop     rsp             ; rsp <- pre-alignment rsp saved at [original_rbp]
    478         pop     rbp             ; restore the caller's rbp
    479         ret
    480 
    481 ; For some reason, the OS X linker does not honor the request to align the
    482 ; segment unless we do this.
    483         align   16
    484