Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jfdctfst.asm - fast integer FDCT (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler); it
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; This file contains a fast, not so accurate integer implementation of
     17 ; the forward DCT (Discrete Cosine Transform). The following code is
     18 ; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
     19 ; for more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
     28 %define CONST_BITS      8       ; 14 is also OK.
        ; CONST_BITS is the number of fractional bits in the fixed-point
        ; multipliers F_x_xxx below: each one is the real constant named in its
        ; FIX(...) comment, scaled by 2^CONST_BITS and rounded to nearest.
     29 
     30 %if CONST_BITS == 8
        ; Precomputed values for the default precision; they equal what the
        ; DESCALE() expressions in the %else branch produce for CONST_BITS == 8.
     31 F_0_382 equ      98             ; FIX(0.382683433)
     32 F_0_541 equ     139             ; FIX(0.541196100)
     33 F_0_707 equ     181             ; FIX(0.707106781)
     34 F_1_306 equ     334             ; FIX(1.306562965)
     35 %else
     36 ; NASM cannot do compile-time arithmetic on floating-point constants.
        ; So each constant is written as an integer scaled by 2^30 and reduced to
        ; CONST_BITS fractional bits.  DESCALE(x,n) is a right shift by n with
        ; round-to-nearest (adds half of 2^n before shifting).
     37 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
     38 F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
     39 F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
     40 F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
     41 F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
     42 %endif
     43 
     44 ; --------------------------------------------------------------------------
     45         SECTION SEG_CONST
     46 
     47 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
     48 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
        ;
        ; pmulhw keeps the high 16 bits of each signed 16x16-bit product, so with
        ; the three shift counts summing to 16,
        ;   ((x << PRE_MULTIPLY_SCALE_BITS) * (F << CONST_SHIFT)) >> 16
        ; equals (x * F) >> CONST_BITS, i.e. x times the real-valued constant.
     49 
     50 %define PRE_MULTIPLY_SCALE_BITS   2
     51 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
     52 
        ; NOTE(review): alignz and EXTN are not defined in this file (presumably
        ; they come from jsimdext.inc); alignz 16 is used before and after the table.
     53         alignz  16
     54         global  EXTN(jconst_fdct_ifast_sse2)
     55 
        ; Exported label marking the start of the constant table.  The entries
        ; are laid out in the order below.
     56 EXTN(jconst_fdct_ifast_sse2):
     57 
        ; Each entry replicates one pre-shifted multiplier into 8 words, i.e. one
        ; full 128-bit memory operand for pmulhw.
     58 PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
     59 PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
     60 PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
     61 PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
     62 
     63         alignz  16
     64 
     65 ; --------------------------------------------------------------------------
     66         SECTION SEG_TEXT
     67         BITS    32
     68 ;
     69 ; Perform the forward DCT on one block of samples.
     70 ;
     71 ; GLOBAL(void)
     72 ; jsimd_fdct_ifast_sse2 (DCTELEM *data)
     73 ;
        ; In:     data = the only argument, read from the stack ([esp+4] on entry).
        ;         Eight rows (XMMBLOCK(0..7,0,data,...)) of 8 word lanes each are
        ;         transformed in place.  Every row is accessed with movdqa, so
        ;         each row address must be 16-byte aligned.
        ; Out:    nothing is returned; the block at data holds the result.
        ; Clobb:  eax, edx, xmm0-xmm7, flags.  ebx receives the GOT address via
        ;         get_GOT and is bracketed by pushpic/poppic.
        ;         NOTE(review): pushpic/poppic/get_GOT/GOTOFF are macros that are
        ;         not defined in this file -- presumably they save/restore ebx
        ;         only when needed; confirm in the include file.
        ; Stack:  WK_NUM xmmword scratch slots (wk(0)..wk(WK_NUM-1)) directly
        ;         below an ebp that is rounded down to a SIZEOF_XMMWORD boundary.
        ; Method: pass 1 transposes the block so that each register holds one
        ;         element position of all 8 rows and runs the 1-D transform
        ;         across registers (all rows at once); pass 2 transposes back and
        ;         repeats the same arithmetic down the columns, then stores.
        ;
     74 
        ; data(b): b is the stack pointer taken right after "push ebp", so the
        ; argument sits 8 bytes above it (saved ebp + return address).
     75 %define data(b)         (b)+8           ; DCTELEM *data
     76 
        ; original_ebp: the slot at the aligned ebp; the prologue stores there
        ; the pointer that the epilogue pops back into esp.  It is not referenced
        ; by name in the code below.
     77 %define original_ebp    ebp+0
        ; wk(i) references WK_NUM, which is defined on the following line; the
        ; macro is only expanded where wk() is used, after WK_NUM exists.
     78 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
     79 %define WK_NUM          2
     80 
     81         align   16
     82         global  EXTN(jsimd_fdct_ifast_sse2)
     83 
     84 EXTN(jsimd_fdct_ifast_sse2):
        ; Prologue: build an aligned frame.  eax remembers the unaligned stack
        ; position; that pointer is also parked at [aligned ebp] so the epilogue
        ; can undo the alignment with a single "pop esp".
     85         push    ebp
     86         mov     eax,esp                         ; eax -> saved original ebp (entry esp - 4)
     87         sub     esp, byte 4                     ; room for the pointer stored below
     88         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     89         mov     [esp],eax                       ; [aligned esp] = pointer to saved ebp
     90         mov     ebp,esp                         ; ebp = aligned ebp
     91         lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below ebp
     92         pushpic ebx
     93 ;       push    ecx             ; unused
     94 ;       push    edx             ; need not be preserved
     95 ;       push    esi             ; unused
     96 ;       push    edi             ; unused
     97 
     98         get_GOT ebx             ; get GOT address
     99 
    100         ; ---- Pass 1: process rows.
    101 
    102         mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
    103 
        ; Load rows 0-3.  Lane comments use "rc" = row r, column c of the block.
    104         movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    105         movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    106         movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    107         movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    108 
    109         ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
    110         ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
    111 
    112         movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
    113         punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
    114         punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
    115         movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
    116         punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
    117         punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
    118 
        ; Load rows 4-7.
    119         movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
    120         movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
    121         movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
    122         movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
    123 
    124         ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
    125         ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
    126 
        ; All eight registers are live, so two partial results are spilled.
    127         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
    128         movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
    129 
    130         movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
    131         punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
    132         punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
    133         movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
    134         punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
    135         punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
    136 
    137         movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
    138         punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
    139         punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
    140         movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
    141         punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
    142         punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
    143 
        ; Swap: reload the spilled row-2/3 halves, spill two row-4..7 results.
    144         movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
    145         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
    146         movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
    147         movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
    148 
    149         movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
    150         punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
    151         punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
    152         movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
    153         punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
    154         punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
    155 
    156         movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
    157         punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
    158         punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
    159         movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
    160         punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
    161         punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
    162 
        ; First butterfly stage on the outer pairs (0,7) and (1,6).
    163         movdqa  xmm6,xmm1
    164         movdqa  xmm3,xmm0
    165         psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
    166         psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
    167         paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
    168         paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
    169 
    170         movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
    171         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
    172         movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
    173         movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
    174 
    175         movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
    176         punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
    177         punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
    178         movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
    179         punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
    180         punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
    181 
        ; First butterfly stage on the inner pairs (3,4) and (2,5).
    182         movdqa  xmm2,xmm1
    183         movdqa  xmm5,xmm7
    184         paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
    185         paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
    186         psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
    187         psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
    188 
    189         ; -- Even part
    190 
    191         movdqa  xmm4,xmm3
    192         movdqa  xmm0,xmm6
    193         psubw   xmm3,xmm1               ; xmm3=tmp13
    194         psubw   xmm6,xmm7               ; xmm6=tmp12
    195         paddw   xmm4,xmm1               ; xmm4=tmp10
    196         paddw   xmm0,xmm7               ; xmm0=tmp11
    197 
        ; Fixed-point multiply: pre-shift the operand left, then pmulhw keeps the
        ; high word of the product with the pre-shifted constant in PW_F0707.
    198         paddw   xmm6,xmm3
    199         psllw   xmm6,PRE_MULTIPLY_SCALE_BITS
    200         pmulhw  xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1
    201 
    202         movdqa  xmm1,xmm4
    203         movdqa  xmm7,xmm3
    204         psubw   xmm4,xmm0               ; xmm4=data4
    205         psubw   xmm3,xmm6               ; xmm3=data6
    206         paddw   xmm1,xmm0               ; xmm1=data0
    207         paddw   xmm7,xmm6               ; xmm7=data2
    208 
        ; Swap again: fetch tmp6/tmp7 for the odd part, park data4/data6 until
        ; pass 2 reloads them.
    209         movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
    210         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
    211         movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
    212         movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
    213 
    214         ; -- Odd part
    215 
    216         paddw   xmm2,xmm5               ; xmm2=tmp10
    217         paddw   xmm5,xmm0               ; xmm5=tmp11
    218         paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
    219 
    220         psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
    221         psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
    222 
    223         psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
    224         pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3
    225 
    226         movdqa  xmm4,xmm2               ; xmm4=tmp10
    227         psubw   xmm2,xmm0
    228         pmulhw  xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5
    229         pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
    230         pmulhw  xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
    231         paddw   xmm4,xmm2               ; xmm4=z2
    232         paddw   xmm0,xmm2               ; xmm0=z4
    233 
    234         movdqa  xmm3,xmm6
    235         psubw   xmm6,xmm5               ; xmm6=z13
    236         paddw   xmm3,xmm5               ; xmm3=z11
    237 
    238         movdqa  xmm2,xmm6
    239         movdqa  xmm5,xmm3
    240         psubw   xmm6,xmm4               ; xmm6=data3
    241         psubw   xmm3,xmm0               ; xmm3=data7
    242         paddw   xmm2,xmm4               ; xmm2=data5
    243         paddw   xmm5,xmm0               ; xmm5=data1
    244 
    245         ; ---- Pass 2: process columns.
    246 
        ; edx has not been modified since pass 1 loaded it, so no reload is needed.
    247 ;       mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
    248 
    249         ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
    250         ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
    251 
    252         movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
    253         punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
    254         punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
    255         movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
    256         punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
    257         punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
    258 
        ; Reload the pass-1 results data4/data6 that were parked in wk().
    259         movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
    260         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
    261 
    262         ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
    263         ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
    264 
    265         movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
    266         movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
    267 
    268         movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
    269         punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
    270         punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
    271         movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
    272         punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
    273         punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
    274 
    275         movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
    276         punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
    277         punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
    278         movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
    279         punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
    280         punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
    281 
    282         movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
    283         movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
    284         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
    285         movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
    286 
    287         movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
    288         punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
    289         punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
    290         movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
    291         punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
    292         punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
    293 
    294         movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
    295         punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
    296         punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
    297         movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
    298         punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
    299         punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
    300 
    301         movdqa  xmm5,xmm6
    302         movdqa  xmm3,xmm1
    303         psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
    304         psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
    305         paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
    306         paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
    307 
    308         movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
    309         movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
    310         movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
    311         movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
    312 
    313         movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
    314         punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
    315         punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
    316         movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
    317         punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
    318         punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
    319 
    320         movdqa  xmm7,xmm6
    321         movdqa  xmm0,xmm2
    322         paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
    323         paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
    324         psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
    325         psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
    326 
    327         ; -- Even part
    328 
    329         movdqa  xmm4,xmm3
    330         movdqa  xmm1,xmm5
    331         psubw   xmm3,xmm6               ; xmm3=tmp13
    332         psubw   xmm5,xmm2               ; xmm5=tmp12
    333         paddw   xmm4,xmm6               ; xmm4=tmp10
    334         paddw   xmm1,xmm2               ; xmm1=tmp11
    335 
    336         paddw   xmm5,xmm3
    337         psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
    338         pmulhw  xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1
    339 
    340         movdqa  xmm6,xmm4
    341         movdqa  xmm2,xmm3
    342         psubw   xmm4,xmm1               ; xmm4=data4
    343         psubw   xmm3,xmm5               ; xmm3=data6
    344         paddw   xmm6,xmm1               ; xmm6=data0
    345         paddw   xmm2,xmm5               ; xmm2=data2
    346 
        ; Write the even output rows straight back over the input block.
    347         movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
    348         movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
    349         movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
    350         movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2
    351 
    352         ; -- Odd part
    353 
    354         movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
    355         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
    356 
    357         paddw   xmm7,xmm0               ; xmm7=tmp10
    358         paddw   xmm0,xmm1               ; xmm0=tmp11
    359         paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
    360 
    361         psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
    362         psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
    363 
    364         psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
    365         pmulhw  xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3
    366 
    367         movdqa  xmm4,xmm7               ; xmm4=tmp10
    368         psubw   xmm7,xmm1
    369         pmulhw  xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5
    370         pmulhw  xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
    371         pmulhw  xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
    372         paddw   xmm4,xmm7               ; xmm4=z2
    373         paddw   xmm1,xmm7               ; xmm1=z4
    374 
    375         movdqa  xmm3,xmm5
    376         psubw   xmm5,xmm0               ; xmm5=z13
    377         paddw   xmm3,xmm0               ; xmm3=z11
    378 
    379         movdqa  xmm6,xmm5
    380         movdqa  xmm2,xmm3
    381         psubw   xmm5,xmm4               ; xmm5=data3
    382         psubw   xmm3,xmm1               ; xmm3=data7
    383         paddw   xmm6,xmm4               ; xmm6=data5
    384         paddw   xmm2,xmm1               ; xmm2=data1
    385 
        ; Write the odd output rows.
    386         movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
    387         movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
    388         movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
    389         movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2
    390 
    391 ;       pop     edi             ; unused
    392 ;       pop     esi             ; unused
    393 ;       pop     edx             ; need not be preserved
    394 ;       pop     ecx             ; unused
        ; Epilogue: [ebp] holds the pointer stored by the prologue, so "pop esp"
        ; jumps esp back to the unaligned slot that contains the caller's ebp.
    395         poppic  ebx
    396         mov     esp,ebp         ; esp <- aligned ebp
    397         pop     esp             ; esp <- original ebp
    398         pop     ebp
    399         ret
    400 
    401 ; For some reason, the OS X linker does not honor the request to align the
    402 ; segment unless we do this.
    403         align   16
    404