Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jfdctfst.asm - fast integer FDCT (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler); it
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; This file contains a fast, less accurate integer implementation of
     19 ; the forward DCT (Discrete Cosine Transform). The following code is
     20 ; based directly on the IJG's original jfdctfst.c; see jfdctfst.c
     21 ; for more details.
     22 ;
     23 ; [TAB8]
     24 
     25 %include "jsimdext.inc"
     26 %include "jdct.inc"
     27 
     28 ; --------------------------------------------------------------------------
     29 
     30 %define CONST_BITS      8       ; fractional bits in the F_* constants below; 14 is also OK.
     31 ; FIX(x) in the comments below denotes x scaled by 2^CONST_BITS and rounded to an integer.
     32 %if CONST_BITS == 8
     33 F_0_382 equ      98             ; FIX(0.382683433)
     34 F_0_541 equ     139             ; FIX(0.541196100)
     35 F_0_707 equ     181             ; FIX(0.707106781)
     36 F_1_306 equ     334             ; FIX(1.306562965)
     37 %else
     38 ; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30.
     39 %define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))  ; right shift by n, rounding to nearest
     40 F_0_382 equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
     41 F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
     42 F_0_707 equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
     43 F_1_306 equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
     44 %endif
     45 
     46 ; --------------------------------------------------------------------------
     47         SECTION SEG_CONST
     48 
     49 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
     50 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
     51 ; pmulhw keeps the high 16 bits of each signed 16x16 product, so the two shifts net out to (x * F) >> CONST_BITS.
     52 %define PRE_MULTIPLY_SCALE_BITS   2       ; left shift (psllw) applied to the data operand before each pmulhw
     53 %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)     ; left shift baked into each PW_* word
     54 
     55         alignz  16      ; alignz is a macro defined outside this file; presumably zero-pads to a 16-byte boundary
     56         global  EXTN(jconst_fdct_ifast_sse2)
     57 
     58 EXTN(jconst_fdct_ifast_sse2):
     59 ; Each PW_* entry below is 8 identical words (one xmmword) and is used as a pmulhw memory operand.
     60 PW_F0707        times 8 dw  F_0_707 << CONST_SHIFT
     61 PW_F0382        times 8 dw  F_0_382 << CONST_SHIFT
     62 PW_F0541        times 8 dw  F_0_541 << CONST_SHIFT
     63 PW_F1306        times 8 dw  F_1_306 << CONST_SHIFT
     64 
     65         alignz  16
     66 
     67 ; --------------------------------------------------------------------------
     68         SECTION SEG_TEXT
     69         BITS    64
     70 ;
     71 ; Perform the forward DCT on one block of samples (read from and written back to the same buffer).
     72 ; Two passes; each pass is an 8x8 word transpose followed by a 1-D DCT computed across the eight xmm registers.
     73 ; GLOBAL(void)
     74 ; jsimd_fdct_ifast_sse2 (DCTELEM * data)
     75 ; The block is accessed with movdqa, so data must be 16-byte aligned.
     76 
     77 ; r10 = DCTELEM * data (read after collect_args; that macro is defined outside this file)
     78 
     79 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM], located just below the aligned rbp
     80 %define WK_NUM          2       ; number of xmmword scratch slots used for register spills
     81 
     82         align   16
     83         global  EXTN(jsimd_fdct_ifast_sse2)
     84 
     85 EXTN(jsimd_fdct_ifast_sse2):
     86         push    rbp
     87         mov     rax,rsp                         ; rax = rsp after the push = address of the saved rbp
     88         sub     rsp, byte 4                     ; NOTE(review): 4 < the 8 bytes stored at [rsp] below; stays clear of the saved rbp only while rsp is a multiple of 8
     89         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     90         mov     [rsp],rax                       ; keep the pre-alignment stack pointer for the epilogue
     91         mov     rbp,rsp                         ; rbp = aligned frame base (wk() is addressed from it)
     92         lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmword slots below rbp
     93         collect_args
     94 
     95         ; ---- Pass 1: process rows.
     96 
     97         mov     rdx, r10        ; rdx = data (DCTELEM *)
     98 
     99         movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)]
    100         movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)]
    101         movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)]
    102         movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)]
    103 
    104         ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
    105         ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)
    106 
    107         movdqa    xmm4,xmm0             ; transpose coefficients(phase 1)
    108         punpcklwd xmm0,xmm1             ; xmm0=(00 10 01 11 02 12 03 13)
    109         punpckhwd xmm4,xmm1             ; xmm4=(04 14 05 15 06 16 07 17)
    110         movdqa    xmm5,xmm2             ; transpose coefficients(phase 1)
    111         punpcklwd xmm2,xmm3             ; xmm2=(20 30 21 31 22 32 23 33)
    112         punpckhwd xmm5,xmm3             ; xmm5=(24 34 25 35 26 36 27 37)
    113 
    114         movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)]
    115         movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)]
    116         movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)]
    117         movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)]
    118 
    119         ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
    120         ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)
    121 
    122         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
    123         movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)
    124 
    125         movdqa    xmm2,xmm6             ; transpose coefficients(phase 1)
    126         punpcklwd xmm6,xmm7             ; xmm6=(40 50 41 51 42 52 43 53)
    127         punpckhwd xmm2,xmm7             ; xmm2=(44 54 45 55 46 56 47 57)
    128         movdqa    xmm5,xmm1             ; transpose coefficients(phase 1)
    129         punpcklwd xmm1,xmm3             ; xmm1=(60 70 61 71 62 72 63 73)
    130         punpckhwd xmm5,xmm3             ; xmm5=(64 74 65 75 66 76 67 77)
    131 
    132         movdqa    xmm7,xmm6             ; transpose coefficients(phase 2)
    133         punpckldq xmm6,xmm1             ; xmm6=(40 50 60 70 41 51 61 71)
    134         punpckhdq xmm7,xmm1             ; xmm7=(42 52 62 72 43 53 63 73)
    135         movdqa    xmm3,xmm2             ; transpose coefficients(phase 2)
    136         punpckldq xmm2,xmm5             ; xmm2=(44 54 64 74 45 55 65 75)
    137         punpckhdq xmm3,xmm5             ; xmm3=(46 56 66 76 47 57 67 77)
    138 
    139         movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
    140         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
    141         movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
    142         movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)
    143 
    144         movdqa    xmm7,xmm0             ; transpose coefficients(phase 2)
    145         punpckldq xmm0,xmm1             ; xmm0=(00 10 20 30 01 11 21 31)
    146         punpckhdq xmm7,xmm1             ; xmm7=(02 12 22 32 03 13 23 33)
    147         movdqa    xmm2,xmm4             ; transpose coefficients(phase 2)
    148         punpckldq xmm4,xmm5             ; xmm4=(04 14 24 34 05 15 25 35)
    149         punpckhdq xmm2,xmm5             ; xmm2=(06 16 26 36 07 17 27 37)
    150 
    151         movdqa     xmm1,xmm0            ; transpose coefficients(phase 3)
    152         punpcklqdq xmm0,xmm6            ; xmm0=(00 10 20 30 40 50 60 70)=data0
    153         punpckhqdq xmm1,xmm6            ; xmm1=(01 11 21 31 41 51 61 71)=data1
    154         movdqa     xmm5,xmm2            ; transpose coefficients(phase 3)
    155         punpcklqdq xmm2,xmm3            ; xmm2=(06 16 26 36 46 56 66 76)=data6
    156         punpckhqdq xmm5,xmm3            ; xmm5=(07 17 27 37 47 57 67 77)=data7
    157 
    158         movdqa  xmm6,xmm1               ; xmm6=data1 (copy)
    159         movdqa  xmm3,xmm0               ; xmm3=data0 (copy)
    160         psubw   xmm1,xmm2               ; xmm1=data1-data6=tmp6
    161         psubw   xmm0,xmm5               ; xmm0=data0-data7=tmp7
    162         paddw   xmm6,xmm2               ; xmm6=data1+data6=tmp1
    163         paddw   xmm3,xmm5               ; xmm3=data0+data7=tmp0
    164 
    165         movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
    166         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
    167         movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
    168         movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7
    169 
    170         movdqa     xmm1,xmm7            ; transpose coefficients(phase 3)
    171         punpcklqdq xmm7,xmm2            ; xmm7=(02 12 22 32 42 52 62 72)=data2
    172         punpckhqdq xmm1,xmm2            ; xmm1=(03 13 23 33 43 53 63 73)=data3
    173         movdqa     xmm0,xmm4            ; transpose coefficients(phase 3)
    174         punpcklqdq xmm4,xmm5            ; xmm4=(04 14 24 34 44 54 64 74)=data4
    175         punpckhqdq xmm0,xmm5            ; xmm0=(05 15 25 35 45 55 65 75)=data5
    176 
    177         movdqa  xmm2,xmm1               ; xmm2=data3 (copy)
    178         movdqa  xmm5,xmm7               ; xmm5=data2 (copy)
    179         paddw   xmm1,xmm4               ; xmm1=data3+data4=tmp3
    180         paddw   xmm7,xmm0               ; xmm7=data2+data5=tmp2
    181         psubw   xmm2,xmm4               ; xmm2=data3-data4=tmp4
    182         psubw   xmm5,xmm0               ; xmm5=data2-data5=tmp5
    183 
    184         ; -- Even part
    185 
    186         movdqa  xmm4,xmm3               ; xmm4=tmp0 (copy)
    187         movdqa  xmm0,xmm6               ; xmm0=tmp1 (copy)
    188         psubw   xmm3,xmm1               ; xmm3=tmp13
    189         psubw   xmm6,xmm7               ; xmm6=tmp12
    190         paddw   xmm4,xmm1               ; xmm4=tmp10
    191         paddw   xmm0,xmm7               ; xmm0=tmp11
    192 
    193         paddw   xmm6,xmm3               ; xmm6=tmp12+tmp13
    194         psllw   xmm6,PRE_MULTIPLY_SCALE_BITS ; pre-scale so that pmulhw's implicit >>16 nets out to >>CONST_BITS
    195         pmulhw  xmm6,[rel PW_F0707] ; xmm6=z1
    196 
    197         movdqa  xmm1,xmm4               ; xmm1=tmp10 (copy)
    198         movdqa  xmm7,xmm3               ; xmm7=tmp13 (copy)
    199         psubw   xmm4,xmm0               ; xmm4=data4
    200         psubw   xmm3,xmm6               ; xmm3=data6
    201         paddw   xmm1,xmm0               ; xmm1=data0
    202         paddw   xmm7,xmm6               ; xmm7=data2
    203 
    204         movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
    205         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
    206         movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
    207         movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6
    208 
    209         ; -- Odd part
    210 
    211         paddw   xmm2,xmm5               ; xmm2=tmp10
    212         paddw   xmm5,xmm0               ; xmm5=tmp11
    213         paddw   xmm0,xmm6               ; xmm0=tmp12, xmm6=tmp7
    214 
    215         psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
    216         psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
    217 
    218         psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
    219         pmulhw  xmm5,[rel PW_F0707] ; xmm5=z3
    220 
    221         movdqa  xmm4,xmm2               ; xmm4=tmp10
    222         psubw   xmm2,xmm0               ; xmm2=tmp10-tmp12 (both already pre-scaled)
    223         pmulhw  xmm2,[rel PW_F0382] ; xmm2=z5
    224         pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
    225         pmulhw  xmm0,[rel PW_F1306] ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
    226         paddw   xmm4,xmm2               ; xmm4=z2
    227         paddw   xmm0,xmm2               ; xmm0=z4
    228 
    229         movdqa  xmm3,xmm6               ; xmm3=tmp7 (copy)
    230         psubw   xmm6,xmm5               ; xmm6=z13
    231         paddw   xmm3,xmm5               ; xmm3=z11
    232 
    233         movdqa  xmm2,xmm6               ; xmm2=z13 (copy)
    234         movdqa  xmm5,xmm3               ; xmm5=z11 (copy)
    235         psubw   xmm6,xmm4               ; xmm6=data3
    236         psubw   xmm3,xmm0               ; xmm3=data7
    237         paddw   xmm2,xmm4               ; xmm2=data5
    238         paddw   xmm5,xmm0               ; xmm5=data1
    239 
    240         ; ---- Pass 2: process columns.
    241 
    242         ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
    243         ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)
    244 
    245         movdqa    xmm4,xmm1             ; transpose coefficients(phase 1)
    246         punpcklwd xmm1,xmm5             ; xmm1=(00 01 10 11 20 21 30 31)
    247         punpckhwd xmm4,xmm5             ; xmm4=(40 41 50 51 60 61 70 71)
    248         movdqa    xmm0,xmm7             ; transpose coefficients(phase 1)
    249         punpcklwd xmm7,xmm6             ; xmm7=(02 03 12 13 22 23 32 33)
    250         punpckhwd xmm0,xmm6             ; xmm0=(42 43 52 53 62 63 72 73)
    251 
    252         movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
    253         movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6
    254 
    255         ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
    256         ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)
    257 
    258         movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
    259         movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)
    260 
    261         movdqa    xmm7,xmm5             ; transpose coefficients(phase 1)
    262         punpcklwd xmm5,xmm2             ; xmm5=(04 05 14 15 24 25 34 35)
    263         punpckhwd xmm7,xmm2             ; xmm7=(44 45 54 55 64 65 74 75)
    264         movdqa    xmm0,xmm6             ; transpose coefficients(phase 1)
    265         punpcklwd xmm6,xmm3             ; xmm6=(06 07 16 17 26 27 36 37)
    266         punpckhwd xmm0,xmm3             ; xmm0=(46 47 56 57 66 67 76 77)
    267 
    268         movdqa    xmm2,xmm5             ; transpose coefficients(phase 2)
    269         punpckldq xmm5,xmm6             ; xmm5=(04 05 06 07 14 15 16 17)
    270         punpckhdq xmm2,xmm6             ; xmm2=(24 25 26 27 34 35 36 37)
    271         movdqa    xmm3,xmm7             ; transpose coefficients(phase 2)
    272         punpckldq xmm7,xmm0             ; xmm7=(44 45 46 47 54 55 56 57)
    273         punpckhdq xmm3,xmm0             ; xmm3=(64 65 66 67 74 75 76 77)
    274 
    275         movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
    276         movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
    277         movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
    278         movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)
    279 
    280         movdqa    xmm2,xmm1             ; transpose coefficients(phase 2)
    281         punpckldq xmm1,xmm6             ; xmm1=(00 01 02 03 10 11 12 13)
    282         punpckhdq xmm2,xmm6             ; xmm2=(20 21 22 23 30 31 32 33)
    283         movdqa    xmm7,xmm4             ; transpose coefficients(phase 2)
    284         punpckldq xmm4,xmm0             ; xmm4=(40 41 42 43 50 51 52 53)
    285         punpckhdq xmm7,xmm0             ; xmm7=(60 61 62 63 70 71 72 73)
    286 
    287         movdqa     xmm6,xmm1            ; transpose coefficients(phase 3)
    288         punpcklqdq xmm1,xmm5            ; xmm1=(00 01 02 03 04 05 06 07)=data0
    289         punpckhqdq xmm6,xmm5            ; xmm6=(10 11 12 13 14 15 16 17)=data1
    290         movdqa     xmm0,xmm7            ; transpose coefficients(phase 3)
    291         punpcklqdq xmm7,xmm3            ; xmm7=(60 61 62 63 64 65 66 67)=data6
    292         punpckhqdq xmm0,xmm3            ; xmm0=(70 71 72 73 74 75 76 77)=data7
    293 
    294         movdqa  xmm5,xmm6               ; xmm5=data1 (copy)
    295         movdqa  xmm3,xmm1               ; xmm3=data0 (copy)
    296         psubw   xmm6,xmm7               ; xmm6=data1-data6=tmp6
    297         psubw   xmm1,xmm0               ; xmm1=data0-data7=tmp7
    298         paddw   xmm5,xmm7               ; xmm5=data1+data6=tmp1
    299         paddw   xmm3,xmm0               ; xmm3=data0+data7=tmp0
    300 
    301         movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
    302         movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
    303         movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
    304         movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7
    305 
    306         movdqa     xmm6,xmm2            ; transpose coefficients(phase 3)
    307         punpcklqdq xmm2,xmm7            ; xmm2=(20 21 22 23 24 25 26 27)=data2
    308         punpckhqdq xmm6,xmm7            ; xmm6=(30 31 32 33 34 35 36 37)=data3
    309         movdqa     xmm1,xmm4            ; transpose coefficients(phase 3)
    310         punpcklqdq xmm4,xmm0            ; xmm4=(40 41 42 43 44 45 46 47)=data4
    311         punpckhqdq xmm1,xmm0            ; xmm1=(50 51 52 53 54 55 56 57)=data5
    312 
    313         movdqa  xmm7,xmm6               ; xmm7=data3 (copy)
    314         movdqa  xmm0,xmm2               ; xmm0=data2 (copy)
    315         paddw   xmm6,xmm4               ; xmm6=data3+data4=tmp3
    316         paddw   xmm2,xmm1               ; xmm2=data2+data5=tmp2
    317         psubw   xmm7,xmm4               ; xmm7=data3-data4=tmp4
    318         psubw   xmm0,xmm1               ; xmm0=data2-data5=tmp5
    319 
    320         ; -- Even part
    321 
    322         movdqa  xmm4,xmm3               ; xmm4=tmp0 (copy)
    323         movdqa  xmm1,xmm5               ; xmm1=tmp1 (copy)
    324         psubw   xmm3,xmm6               ; xmm3=tmp13
    325         psubw   xmm5,xmm2               ; xmm5=tmp12
    326         paddw   xmm4,xmm6               ; xmm4=tmp10
    327         paddw   xmm1,xmm2               ; xmm1=tmp11
    328 
    329         paddw   xmm5,xmm3               ; xmm5=tmp12+tmp13
    330         psllw   xmm5,PRE_MULTIPLY_SCALE_BITS ; pre-scale so that pmulhw's implicit >>16 nets out to >>CONST_BITS
    331         pmulhw  xmm5,[rel PW_F0707] ; xmm5=z1
    332 
    333         movdqa  xmm6,xmm4               ; xmm6=tmp10 (copy)
    334         movdqa  xmm2,xmm3               ; xmm2=tmp13 (copy)
    335         psubw   xmm4,xmm1               ; xmm4=data4
    336         psubw   xmm3,xmm5               ; xmm3=data6
    337         paddw   xmm6,xmm1               ; xmm6=data0
    338         paddw   xmm2,xmm5               ; xmm2=data2
    339 
    340         movdqa  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_DCTELEM)], xmm4        ; store data4
    341         movdqa  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_DCTELEM)], xmm3        ; store data6
    342         movdqa  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_DCTELEM)], xmm6        ; store data0
    343         movdqa  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_DCTELEM)], xmm2        ; store data2
    344 
    345         ; -- Odd part
    346 
    347         movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
    348         movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7
    349 
    350         paddw   xmm7,xmm0               ; xmm7=tmp10
    351         paddw   xmm0,xmm1               ; xmm0=tmp11
    352         paddw   xmm1,xmm5               ; xmm1=tmp12, xmm5=tmp7
    353 
    354         psllw   xmm7,PRE_MULTIPLY_SCALE_BITS
    355         psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
    356 
    357         psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
    358         pmulhw  xmm0,[rel PW_F0707] ; xmm0=z3
    359 
    360         movdqa  xmm4,xmm7               ; xmm4=tmp10
    361         psubw   xmm7,xmm1               ; xmm7=tmp10-tmp12 (both already pre-scaled)
    362         pmulhw  xmm7,[rel PW_F0382] ; xmm7=z5
    363         pmulhw  xmm4,[rel PW_F0541] ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
    364         pmulhw  xmm1,[rel PW_F1306] ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
    365         paddw   xmm4,xmm7               ; xmm4=z2
    366         paddw   xmm1,xmm7               ; xmm1=z4
    367 
    368         movdqa  xmm3,xmm5               ; xmm3=tmp7 (copy)
    369         psubw   xmm5,xmm0               ; xmm5=z13
    370         paddw   xmm3,xmm0               ; xmm3=z11
    371 
    372         movdqa  xmm6,xmm5               ; xmm6=z13 (copy)
    373         movdqa  xmm2,xmm3               ; xmm2=z11 (copy)
    374         psubw   xmm5,xmm4               ; xmm5=data3
    375         psubw   xmm3,xmm1               ; xmm3=data7
    376         paddw   xmm6,xmm4               ; xmm6=data5
    377         paddw   xmm2,xmm1               ; xmm2=data1
    378 
    379         movdqa  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_DCTELEM)], xmm5        ; store data3
    380         movdqa  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_DCTELEM)], xmm3        ; store data7
    381         movdqa  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_DCTELEM)], xmm6        ; store data5
    382         movdqa  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_DCTELEM)], xmm2        ; store data1
    383 
    384         uncollect_args
    385         mov     rsp,rbp         ; rsp <- aligned rbp
    386         pop     rsp             ; rsp <- value saved at [aligned rbp] in the prologue = address of the saved rbp
    387         pop     rbp             ; restore the caller's rbp
    388         ret
    389 
    390 ; For some reason, the OS X linker does not honor the request to align the
    391 ; segment unless we do this.
    392         align   16
    393