Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jfdctint.asm - accurate integer FDCT (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler),
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; This file contains a slow-but-accurate integer implementation of the
     17 ; forward DCT (Discrete Cosine Transform). The following code is based
     18 ; directly on the IJG's original jfdctint.c; see the jfdctint.c for
     19 ; more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
     28 %define CONST_BITS      13      ; fractional bits of the F_x_xxx fixed-point constants
     29 %define PASS1_BITS      2       ; pass 1 output is left scaled up by 2^PASS1_BITS
     30 
     31 %define DESCALE_P1      (CONST_BITS-PASS1_BITS) ; right-shift count for pass 1 products
     32 %define DESCALE_P2      (CONST_BITS+PASS1_BITS) ; right-shift count for pass 2 products
     33 
        ; F_a_bcd is the constant a.bcd... expressed as an integer with CONST_BITS
        ; fractional bits (FIX(x) = x * 2^CONST_BITS, rounded).  The literal table
        ; is used only when CONST_BITS is 13; any other value takes the %else path.
     34 %if CONST_BITS == 13
     35 F_0_298 equ      2446           ; FIX(0.298631336)
     36 F_0_390 equ      3196           ; FIX(0.390180644)
     37 F_0_541 equ      4433           ; FIX(0.541196100)
     38 F_0_765 equ      6270           ; FIX(0.765366865)
     39 F_0_899 equ      7373           ; FIX(0.899976223)
     40 F_1_175 equ      9633           ; FIX(1.175875602)
     41 F_1_501 equ     12299           ; FIX(1.501321110)
     42 F_1_847 equ     15137           ; FIX(1.847759065)
     43 F_1_961 equ     16069           ; FIX(1.961570560)
     44 F_2_053 equ     16819           ; FIX(2.053119869)
     45 F_2_562 equ     20995           ; FIX(2.562915447)
     46 F_3_072 equ     25172           ; FIX(3.072711026)
     47 %else
     48 ; NASM cannot do compile-time arithmetic on floating-point constants.
        ; Instead, each constant is written as an integer with 30 fractional bits
        ; and DESCALE() rounds it down to CONST_BITS fractional bits:
        ; add half of the divisor (1 << (n-1)), then shift right by n.
     49 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
     50 F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
     51 F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
     52 F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
     53 F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
     54 F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
     55 F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
     56 F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
     57 F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
     58 F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
     59 F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
     60 F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
     61 F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
     62 %endif
     63 
     64 ; --------------------------------------------------------------------------
     65         SECTION SEG_CONST
     66 
        ; Read-only operand table for the routine below.  Every entry is one
        ; 8-byte MMX operand.
        ;   PW_* = packed words, PD_* = packed dwords.
        ;   In a PW_<a>_<b> name, Fnnn is roughly +n.nn and MFnnn roughly -n.nn
        ;   (e.g. F130 = 0.541+0.765, MF130 = 0.541-1.847).
        ; The coefficient pairs are repeated twice so that one pmaddwd against
        ; two interleaved inputs (x0 y0 x1 y1) yields x*a + y*b in each dword.
     67         alignz  16      ; NOTE(review): alignz is a macro from the include files - presumably zero-padded alignment
     68         global  EXTN(jconst_fdct_islow_mmx)
     69 
     70 EXTN(jconst_fdct_islow_mmx):
     71 
     72 PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541  ; data2 = tmp13*a + tmp12*b
     73 PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)  ; data6 = tmp13*a + tmp12*b
     74 PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175  ; z3 term (inputs z3,z4)
     75 PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)  ; z4 term (inputs z3,z4)
     76 PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899  ; tmp4 term (inputs tmp4,tmp7)
     77 PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)  ; tmp7 term (inputs tmp4,tmp7)
     78 PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562  ; tmp5 term (inputs tmp5,tmp6)
     79 PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)  ; tmp6 term (inputs tmp5,tmp6)
     80 PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)         ; rounding bias added before psrad DESCALE_P1
     81 PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)         ; rounding bias added before psrad DESCALE_P2
     82 PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)         ; word rounding bias before psraw PASS1_BITS (pass 2 data0/data4)
     83 
     84         alignz  16
     85 
     86 ; --------------------------------------------------------------------------
     87         SECTION SEG_TEXT
     88         BITS    32
     89 ;
     90 ; Perform the forward DCT on one block of samples.
     91 ;
     92 ; GLOBAL(void)
     93 ; jsimd_fdct_islow_mmx (DCTELEM *data)
     94 ;
        ; In:     data  - the only argument, read from the stack at [esp+4] on entry.
        ;                 The block is transformed in place: both passes load from
        ;                 and store to this buffer.
        ; Out:    nothing in registers.
        ; Uses:   eax, ecx, edx and mm0-mm7 without saving them; ebx is wrapped
        ;         in pushpic/poppic; ebp and esp are restored before ret; emms is
        ;         executed before returning.
        ; Stack:  WK_NUM mmword scratch slots on an 8-byte-aligned frame.
     95 
     96 %define data(b)         (b)+8           ; DCTELEM *data  (b -> saved ebp; +4 = return address, +8 = first argument)
     97 
     98 %define original_ebp    ebp+0           ; slot at [ebp] holding the pre-alignment stack pointer
     99 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM], placed directly below the aligned ebp
    100 %define WK_NUM          2
    101 
    102         align   16
    103         global  EXTN(jsimd_fdct_islow_mmx)
    104 
    105 EXTN(jsimd_fdct_islow_mmx):
    106         push    ebp
    107         mov     eax,esp                         ; eax = original ebp (i.e. esp right after the push; data(eax) reaches the argument)
    108         sub     esp, byte 4                     ; room for the saved pointer stored two lines below
    109         and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
    110         mov     [esp],eax                       ; remember the unaligned frame so the epilogue can undo the alignment
    111         mov     ebp,esp                         ; ebp = aligned ebp
    112         lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below ebp
    113         pushpic ebx                             ; NOTE(review): macro from the include files - presumably saves ebx only in PIC builds
    114 ;       push    ecx             ; need not be preserved
    115 ;       push    edx             ; need not be preserved
    116 ;       push    esi             ; unused
    117 ;       push    edi             ; unused
    118 
    119         get_GOT ebx             ; get GOT address (ebx is the base used by every GOTOFF() operand)
    120 
    121         ; ---- Pass 1: process rows.
                ;
                ; Each iteration transforms four rows at once, one row per MMX word
                ; lane, so DCTSIZE/4 iterations cover the block.  Outputs are left
                ; scaled up by 2^PASS1_BITS and overwrite the input.  Output term k
                ; of the four rows is stored as one 4-word group, so each 4x4 tile
                ; ends up transposed in memory.
                ;
                ; Registers: edx = current 4-row band, ecx = iterations left,
                ; ebx = GOT base, eax = frame base (kept intact for pass 2),
                ; wk(0)/wk(1) = spill slots.
    122 
    123         mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
    124         mov     ecx, DCTSIZE/4                  ; four rows per iteration
    125         alignx  16,7
    126 .rowloop:
    127 
    128         movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    129         movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    130         movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
    131         movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
    132 
    133         ; mm0=(20 21 22 23), mm2=(24 25 26 27)
    134         ; mm1=(30 31 32 33), mm3=(34 35 36 37)
    135 
    136         movq      mm4,mm0               ; transpose coefficients(phase 1)
    137         punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
    138         punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
    139         movq      mm5,mm2               ; transpose coefficients(phase 1)
    140         punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
    141         punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
    142 
    143         movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    144         movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    145         movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
    146         movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
    147 
    148         ; mm6=(00 01 02 03), mm1=(04 05 06 07)
    149         ; mm7=(10 11 12 13), mm3=(14 15 16 17)
    150 
                ; All eight registers are live here; spill two to free mm4/mm2.
    151         movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
    152         movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
    153 
    154         movq      mm4,mm6               ; transpose coefficients(phase 1)
    155         punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
    156         punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
    157         movq      mm2,mm1               ; transpose coefficients(phase 1)
    158         punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
    159         punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
    160 
    161         movq      mm7,mm6               ; transpose coefficients(phase 2)
    162         punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
    163         punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
    164         movq      mm3,mm2               ; transpose coefficients(phase 2)
    165         punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
    166         punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
    167 
    168         movq    mm0,mm7
    169         movq    mm5,mm6
    170         psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
    171         psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
    172         paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
    173         paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
    174 
                ; Swap: reload the spilled transpose halves, park tmp6/tmp7 for the odd part.
    175         movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
    176         movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
    177         movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    178         movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
    179 
    180         movq      mm7,mm4               ; transpose coefficients(phase 2)
    181         punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
    182         punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
    183         movq      mm6,mm1               ; transpose coefficients(phase 2)
    184         punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
    185         punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
    186 
    187         movq    mm2,mm7
    188         movq    mm3,mm4
    189         paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
    190         paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
    191         psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
    192         psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
    193 
    194         ; -- Even part
    195 
    196         movq    mm1,mm5
    197         movq    mm6,mm0
    198         paddw   mm5,mm7                 ; mm5=tmp10
    199         paddw   mm0,mm4                 ; mm0=tmp11
    200         psubw   mm1,mm7                 ; mm1=tmp13
    201         psubw   mm6,mm4                 ; mm6=tmp12
    202 
    203         movq    mm7,mm5
    204         paddw   mm5,mm0                 ; mm5=tmp10+tmp11
    205         psubw   mm7,mm0                 ; mm7=tmp10-tmp11
    206 
                ; No multiply on this path, so the PASS1_BITS scale is applied by a left shift.
    207         psllw   mm5,PASS1_BITS          ; mm5=data0
    208         psllw   mm7,PASS1_BITS          ; mm7=data4
    209 
    210         movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    211         movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
    212 
    213         ; (Original)
    214         ; z1 = (tmp12 + tmp13) * 0.541196100;
    215         ; data2 = z1 + tmp13 * 0.765366865;
    216         ; data6 = z1 + tmp12 * -1.847759065;
    217         ;
    218         ; (This implementation)
    219         ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    220         ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
    221 
                ; Interleave tmp13/tmp12 word-wise so each pmaddwd below produces
                ; tmp13*a + tmp12*b as a 32-bit sum (L = lanes 0-1, H = lanes 2-3).
    222         movq      mm4,mm1               ; mm1=tmp13
    223         movq      mm0,mm1
    224         punpcklwd mm4,mm6               ; mm6=tmp12
    225         punpckhwd mm0,mm6
    226         movq      mm1,mm4
    227         movq      mm6,mm0
    228         pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
    229         pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
    230         pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
    231         pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
    232 
                ; Descale: add the rounding bias, shift right arithmetically by DESCALE_P1.
    233         paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
    234         paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
    235         psrad   mm4,DESCALE_P1
    236         psrad   mm0,DESCALE_P1
    237         paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
    238         paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
    239         psrad   mm1,DESCALE_P1
    240         psrad   mm6,DESCALE_P1
    241 
                ; Narrow the dwords back to four words (signed saturation).
    242         packssdw  mm4,mm0               ; mm4=data2
    243         packssdw  mm1,mm6               ; mm1=data6
    244 
    245         movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    246         movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
    247 
    248         ; -- Odd part
    249 
    250         movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
    251         movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
    252 
    253         movq    mm0,mm2                 ; mm2=tmp4
    254         movq    mm6,mm3                 ; mm3=tmp5
    255         paddw   mm0,mm5                 ; mm0=z3
    256         paddw   mm6,mm7                 ; mm6=z4
    257 
    258         ; (Original)
    259         ; z5 = (z3 + z4) * 1.175875602;
    260         ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    261         ; z3 += z5;  z4 += z5;
    262         ;
    263         ; (This implementation)
    264         ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    265         ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    266 
    267         movq      mm4,mm0
    268         movq      mm1,mm0
    269         punpcklwd mm4,mm6
    270         punpckhwd mm1,mm6
    271         movq      mm0,mm4
    272         movq      mm6,mm1
    273         pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
    274         pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
    275         pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
    276         pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
    277 
                ; z3 goes to the scratch slots; z4 stays in mm0/mm6.  Both are
                ; still 32-bit and not yet descaled, and both are used twice below.
    278         movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
    279         movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
    280 
    281         ; (Original)
    282         ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    283         ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    284         ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    285         ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    286         ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    287         ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    288         ;
    289         ; (This implementation)
    290         ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    291         ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    292         ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    293         ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    294         ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    295         ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
    296 
                ; First pair: tmp4 (mm2) with tmp7 (mm7) -> data7 and data1.
    297         movq      mm4,mm2
    298         movq      mm1,mm2
    299         punpcklwd mm4,mm7
    300         punpckhwd mm1,mm7
    301         movq      mm2,mm4
    302         movq      mm7,mm1
    303         pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
    304         pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
    305         pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
    306         pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
    307 
    308         paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
    309         paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
    310         paddd   mm2,mm0                 ; mm2=data1L
    311         paddd   mm7,mm6                 ; mm7=data1H
    312 
    313         paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
    314         paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
    315         psrad   mm4,DESCALE_P1
    316         psrad   mm1,DESCALE_P1
    317         paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
    318         paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
    319         psrad   mm2,DESCALE_P1
    320         psrad   mm7,DESCALE_P1
    321 
    322         packssdw  mm4,mm1               ; mm4=data7
    323         packssdw  mm2,mm7               ; mm2=data1
    324 
    325         movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
    326         movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
    327 
                ; Second pair: tmp5 (mm3) with tmp6 (mm5) -> data5 and data3.
    328         movq      mm1,mm3
    329         movq      mm7,mm3
    330         punpcklwd mm1,mm5
    331         punpckhwd mm7,mm5
    332         movq      mm3,mm1
    333         movq      mm5,mm7
    334         pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
    335         pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
    336         pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
    337         pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
    338 
    339         paddd   mm1,mm0                 ; mm1=data5L
    340         paddd   mm7,mm6                 ; mm7=data5H
    341         paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
    342         paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
    343 
    344         paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
    345         paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
    346         psrad   mm1,DESCALE_P1
    347         psrad   mm7,DESCALE_P1
    348         paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
    349         paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
    350         psrad   mm3,DESCALE_P1
    351         psrad   mm5,DESCALE_P1
    352 
    353         packssdw  mm1,mm7               ; mm1=data5
    354         packssdw  mm3,mm5               ; mm3=data3
    355 
    356         movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
    357         movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
    358 
    359         add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; advance to the next band of four rows
    360         dec     ecx
    361         jnz     near .rowloop                           ; near form: the loop body is much longer than a short jump can span
    362 
    363         ; ---- Pass 2: process columns.
                ;
                ; Same butterfly as pass 1, applied along the other axis.  Each
                ; iteration handles a 4-word-wide strip through all eight rows, so
                ; DCTSIZE/4 iterations cover the block.  Results are descaled by
                ; DESCALE_P2 (or by PASS1_BITS on the multiply-free data0/data4
                ; path), which removes the extra scale left by pass 1.
                ;
                ; Registers: edx = current strip, ecx = iterations left,
                ; ebx = GOT base, wk(0)/wk(1) = spill slots.  eax must still hold
                ; the frame base set up in the prologue.
    364 
    365         mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
    366         mov     ecx, DCTSIZE/4                  ; four words of every row per iteration
    367         alignx  16,7
    368 .columnloop:
    369 
    370         movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
    371         movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
    372         movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
    373         movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
    374 
    375         ; mm0=(02 12 22 32), mm2=(42 52 62 72)
    376         ; mm1=(03 13 23 33), mm3=(43 53 63 73)
    377 
    378         movq      mm4,mm0               ; transpose coefficients(phase 1)
    379         punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
    380         punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
    381         movq      mm5,mm2               ; transpose coefficients(phase 1)
    382         punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
    383         punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
    384 
    385         movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
    386         movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
    387         movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
    388         movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
    389 
    390         ; mm6=(00 10 20 30), mm1=(40 50 60 70)
    391         ; mm7=(01 11 21 31), mm3=(41 51 61 71)
    392 
                ; All eight registers are live here; spill two to free mm4/mm2.
    393         movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
    394         movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
    395 
    396         movq      mm4,mm6               ; transpose coefficients(phase 1)
    397         punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
    398         punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
    399         movq      mm2,mm1               ; transpose coefficients(phase 1)
    400         punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
    401         punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
    402 
    403         movq      mm7,mm6               ; transpose coefficients(phase 2)
    404         punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
    405         punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
    406         movq      mm3,mm2               ; transpose coefficients(phase 2)
    407         punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
    408         punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
    409 
    410         movq    mm0,mm7
    411         movq    mm5,mm6
    412         psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
    413         psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
    414         paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
    415         paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
    416 
                ; Swap: reload the spilled transpose halves, park tmp6/tmp7 for the odd part.
    417         movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
    418         movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
    419         movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
    420         movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
    421 
    422         movq      mm7,mm4               ; transpose coefficients(phase 2)
    423         punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
    424         punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
    425         movq      mm6,mm1               ; transpose coefficients(phase 2)
    426         punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
    427         punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
    428 
    429         movq    mm2,mm7
    430         movq    mm3,mm4
    431         paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
    432         paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
    433         psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
    434         psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
    435 
    436         ; -- Even part
    437 
    438         movq    mm1,mm5
    439         movq    mm6,mm0
    440         paddw   mm5,mm7                 ; mm5=tmp10
    441         paddw   mm0,mm4                 ; mm0=tmp11
    442         psubw   mm1,mm7                 ; mm1=tmp13
    443         psubw   mm6,mm4                 ; mm6=tmp12
    444 
    445         movq    mm7,mm5
    446         paddw   mm5,mm0                 ; mm5=tmp10+tmp11
    447         psubw   mm7,mm0                 ; mm7=tmp10-tmp11
    448 
                ; No multiply on this path: round and shift out the PASS1_BITS scale
                ; directly in 16-bit lanes.
    449         paddw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
    450         paddw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
    451         psraw   mm5,PASS1_BITS          ; mm5=data0
    452         psraw   mm7,PASS1_BITS          ; mm7=data4
    453 
    454         movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
    455         movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
    456 
    457         ; (Original)
    458         ; z1 = (tmp12 + tmp13) * 0.541196100;
    459         ; data2 = z1 + tmp13 * 0.765366865;
    460         ; data6 = z1 + tmp12 * -1.847759065;
    461         ;
    462         ; (This implementation)
    463         ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
    464         ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
    465 
                ; Interleave tmp13/tmp12 word-wise so each pmaddwd below produces
                ; tmp13*a + tmp12*b as a 32-bit sum (L = lanes 0-1, H = lanes 2-3).
    466         movq      mm4,mm1               ; mm1=tmp13
    467         movq      mm0,mm1
    468         punpcklwd mm4,mm6               ; mm6=tmp12
    469         punpckhwd mm0,mm6
    470         movq      mm1,mm4
    471         movq      mm6,mm0
    472         pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
    473         pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
    474         pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
    475         pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
    476 
                ; Descale: add the rounding bias, shift right arithmetically by DESCALE_P2.
    477         paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
    478         paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
    479         psrad   mm4,DESCALE_P2
    480         psrad   mm0,DESCALE_P2
    481         paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
    482         paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
    483         psrad   mm1,DESCALE_P2
    484         psrad   mm6,DESCALE_P2
    485 
                ; Narrow the dwords back to four words (signed saturation).
    486         packssdw  mm4,mm0               ; mm4=data2
    487         packssdw  mm1,mm6               ; mm1=data6
    488 
    489         movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
    490         movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
    491 
    492         ; -- Odd part
    493 
    494         movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
    495         movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
    496 
    497         movq    mm0,mm2                 ; mm2=tmp4
    498         movq    mm6,mm3                 ; mm3=tmp5
    499         paddw   mm0,mm5                 ; mm0=z3
    500         paddw   mm6,mm7                 ; mm6=z4
    501 
    502         ; (Original)
    503         ; z5 = (z3 + z4) * 1.175875602;
    504         ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    505         ; z3 += z5;  z4 += z5;
    506         ;
    507         ; (This implementation)
    508         ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    509         ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    510 
    511         movq      mm4,mm0
    512         movq      mm1,mm0
    513         punpcklwd mm4,mm6
    514         punpckhwd mm1,mm6
    515         movq      mm0,mm4
    516         movq      mm6,mm1
    517         pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
    518         pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
    519         pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
    520         pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
    521 
                ; z3 goes to the scratch slots; z4 stays in mm0/mm6.  Both are
                ; still 32-bit and not yet descaled, and both are used twice below.
    522         movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
    523         movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
    524 
    525         ; (Original)
    526         ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
    527         ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
    528         ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
    529         ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    530         ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
    531         ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
    532         ;
    533         ; (This implementation)
    534         ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
    535         ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
    536         ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
    537         ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
    538         ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
    539         ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
    540 
                ; First pair: tmp4 (mm2) with tmp7 (mm7) -> data7 and data1.
    541         movq      mm4,mm2
    542         movq      mm1,mm2
    543         punpcklwd mm4,mm7
    544         punpckhwd mm1,mm7
    545         movq      mm2,mm4
    546         movq      mm7,mm1
    547         pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
    548         pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
    549         pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
    550         pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
    551 
    552         paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
    553         paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
    554         paddd   mm2,mm0                 ; mm2=data1L
    555         paddd   mm7,mm6                 ; mm7=data1H
    556 
    557         paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
    558         paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
    559         psrad   mm4,DESCALE_P2
    560         psrad   mm1,DESCALE_P2
    561         paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
    562         paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
    563         psrad   mm2,DESCALE_P2
    564         psrad   mm7,DESCALE_P2
    565 
    566         packssdw  mm4,mm1               ; mm4=data7
    567         packssdw  mm2,mm7               ; mm2=data1
    568 
    569         movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
    570         movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
    571 
                ; Second pair: tmp5 (mm3) with tmp6 (mm5) -> data5 and data3.
    572         movq      mm1,mm3
    573         movq      mm7,mm3
    574         punpcklwd mm1,mm5
    575         punpckhwd mm7,mm5
    576         movq      mm3,mm1
    577         movq      mm5,mm7
    578         pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
    579         pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
    580         pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
    581         pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
    582 
    583         paddd   mm1,mm0                 ; mm1=data5L
    584         paddd   mm7,mm6                 ; mm7=data5H
    585         paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
    586         paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
    587 
    588         paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
    589         paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
    590         psrad   mm1,DESCALE_P2
    591         psrad   mm7,DESCALE_P2
    592         paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
    593         paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
    594         psrad   mm3,DESCALE_P2
    595         psrad   mm5,DESCALE_P2
    596 
    597         packssdw  mm1,mm7               ; mm1=data5
    598         packssdw  mm3,mm5               ; mm3=data3
    599 
    600         movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
    601         movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
    602 
    603         add     edx, byte 4*SIZEOF_DCTELEM      ; step four words to the right, same rows
    604         dec     ecx
    605         jnz     near .columnloop                ; near form: the loop body is much longer than a short jump can span
    606 
    607         emms            ; empty MMX state
    608 
                ; Unwind in the reverse order of the prologue.
    609 ;       pop     edi             ; unused
    610 ;       pop     esi             ; unused
    611 ;       pop     edx             ; need not be preserved
    612 ;       pop     ecx             ; need not be preserved
    613         poppic  ebx             ; counterpart of pushpic in the prologue
    614         mov     esp,ebp         ; esp <- aligned ebp
    615         pop     esp             ; esp <- original ebp (the value the prologue stored at [ebp])
    616         pop     ebp             ; restore the caller's ebp pushed on entry
    617         ret
    618 
    619 ; For some reason, the OS X linker does not honor the request to align the
    620 ; segment unless we do this.
    621         align   16
    622