Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jfdctflt.asm - floating-point FDCT (64-bit SSE)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; This file contains a floating-point implementation of the forward DCT
     18 ; (Discrete Cosine Transform). The following code is based directly on
     19 ; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
     28 %macro  unpcklps2 2     ; %1=(a0 a1 a2 a3), %2=(b0 b1 b2 b3) => %1=(a0 a1 b0 b1)
     29         shufps  %1, %2, 01000100b       ; selectors: lane0=%1[0] lane1=%1[1] lane2=%2[0] lane3=%2[1]
     30 %endmacro
     31 
     32 %macro  unpckhps2 2     ; %1=(a0 a1 a2 a3), %2=(b0 b1 b2 b3) => %1=(a2 a3 b2 b3)
     33         shufps  %1, %2, 11101110b       ; selectors: lane0=%1[2] lane1=%1[3] lane2=%2[2] lane3=%2[3]
     34 %endmacro
     35 
     36 ; --------------------------------------------------------------------------
     37         SECTION SEG_CONST
     38 
     39         alignz  16
     40         global  EXTN(jconst_fdct_float_sse)
     41 
     42 EXTN(jconst_fdct_float_sse):
     43 
        ; Each constant is replicated into all four single-precision lanes so it
        ; can be used directly as a packed mulps operand.
        ; With c2=cos(pi/8), c4=cos(pi/4), c6=cos(3*pi/8):
        ;   PD_0_382 = c6, PD_0_707 = c4, PD_0_541 = c2-c6, PD_1_306 = c2+c6
     44 PD_0_382:       dd  0.382683432365089771728460, 0.382683432365089771728460, 0.382683432365089771728460, 0.382683432365089771728460
     45 PD_0_707:       dd  0.707106781186547524400844, 0.707106781186547524400844, 0.707106781186547524400844, 0.707106781186547524400844
     46 PD_0_541:       dd  0.541196100146196984399723, 0.541196100146196984399723, 0.541196100146196984399723, 0.541196100146196984399723
     47 PD_1_306:       dd  1.306562964876376527856643, 1.306562964876376527856643, 1.306562964876376527856643, 1.306562964876376527856643
     48 
     49         alignz  16
     50 
     51 ; --------------------------------------------------------------------------
     52         SECTION SEG_TEXT
     53         BITS    64
     54 ;
     55 ; Perform the forward DCT on one block of samples.
        ;
        ; The block is transformed in place in two passes (rows, then columns).
        ; Each loop iteration handles four rows (pass 1) or four columns (pass 2)
        ; at once, one per lane of an XMM register, and each loop runs DCTSIZE/4
        ; times.
        ;
        ; Every access to the block is a movaps, which needs a 16-byte-aligned
        ; address, so `data` must be 16-byte aligned.
        ; (Assumes XMMBLOCK offsets are multiples of 16 -- defined in
        ; jsimdext.inc, TODO confirm.)
     56 ;
     57 ; GLOBAL(void)
     58 ; jsimd_fdct_float_sse (FAST_FLOAT *data)
     59 ;
        ; Registers written here: rax, rcx, rdx, xmm0-xmm7, flags.
        ; rbp and rsp are restored before returning.
        ; NOTE(review): xmm6/xmm7 are callee-saved under the Microsoft x64 ABI;
        ; this code does not save them itself, so presumably collect_args /
        ; uncollect_args do on that target -- confirm in jsimdext.inc.
     60 
     61 ; r10 = FAST_FLOAT *data  (presumably set up by collect_args -- confirm in jsimdext.inc)
     62 
        ; wk(i) is the address expression of the i-th SIZEOF_XMMWORD-sized scratch
        ; slot; the WK_NUM slots sit immediately below the aligned rbp.
     63 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
     64 %define WK_NUM          2
     65 
     66         align   16
     67         global  EXTN(jsimd_fdct_float_sse)
     68 
     69 EXTN(jsimd_fdct_float_sse):
        ; Prologue: build a SIZEOF_XMMWORD-aligned frame, remember the unaligned
        ; stack pointer at its base, and carve the wk[] slots out below it.
     70         push    rbp
     71         mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
        ; NOTE(review): only 4 bytes are reserved here although the store below
        ; writes 8. The saved rbp is still not overwritten as long as rsp is
        ; 8-byte aligned at this point.
     72         sub     rsp, byte 4
     73         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
     74         mov     [rsp],rax                       ; stash the unaligned rsp at the frame base
     75         mov     rbp,rsp                         ; rbp = aligned frame base
     76         lea     rsp, [wk(0)]                    ; rsp = &wk[0]: reserve the scratch slots
     77         collect_args                            ; macro from jsimdext.inc; expected to leave data in r10
     78 
     79         ; ---- Pass 1: process rows.
     80 
     81         mov     rdx, r10        ; (FAST_FLOAT *)
     82         mov     rcx, DCTSIZE/4
     83 .rowloop:
        ; Invariant: rdx points at the first of the 4 rows handled by this
        ; iteration; rcx = iterations remaining (> 0).
        ; The 4x8 strip is transposed so that each register holds one
        ; input sample position (data0..data7) for all 4 rows.
     84 
     85         movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
     86         movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
     87         movaps  xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
     88         movaps  xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
     89 
     90         ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
     91         ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
     92 
     93         movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
     94         unpcklps xmm0,xmm1              ; xmm0=(20 30 21 31)
     95         unpckhps xmm4,xmm1              ; xmm4=(22 32 23 33)
     96         movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
     97         unpcklps xmm2,xmm3              ; xmm2=(24 34 25 35)
     98         unpckhps xmm5,xmm3              ; xmm5=(26 36 27 37)
     99 
    100         movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    101         movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    102         movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    103         movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
    104 
    105         ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
    106         ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
    107 
        ; All eight XMM registers are live, so two half-transposed vectors
        ; are parked in the scratch slots until they are needed again.
    108         movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
    109         movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
    110 
    111         movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
    112         unpcklps xmm6,xmm7              ; xmm6=(00 10 01 11)
    113         unpckhps xmm4,xmm7              ; xmm4=(02 12 03 13)
    114         movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
    115         unpcklps xmm1,xmm3              ; xmm1=(04 14 05 15)
    116         unpckhps xmm2,xmm3              ; xmm2=(06 16 07 17)
    117 
    118         movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
    119         unpcklps2 xmm6,xmm0             ; xmm6=(00 10 20 30)=data0
    120         unpckhps2 xmm7,xmm0             ; xmm7=(01 11 21 31)=data1
    121         movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
    122         unpcklps2 xmm2,xmm5             ; xmm2=(06 16 26 36)=data6
    123         unpckhps2 xmm3,xmm5             ; xmm3=(07 17 27 37)=data7
    124 
    125         movaps  xmm0,xmm7
    126         movaps  xmm5,xmm6
    127         subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
    128         subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
    129         addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
    130         addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
    131 
        ; Swap: reload the parked vectors and reuse the same slots to hold
        ; tmp6/tmp7 until the odd part.
    132         movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
    133         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
    134         movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
    135         movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
    136 
    137         movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
    138         unpcklps2 xmm4,xmm2             ; xmm4=(02 12 22 32)=data2
    139         unpckhps2 xmm7,xmm2             ; xmm7=(03 13 23 33)=data3
    140         movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
    141         unpcklps2 xmm1,xmm3             ; xmm1=(04 14 24 34)=data4
    142         unpckhps2 xmm6,xmm3             ; xmm6=(05 15 25 35)=data5
    143 
    144         movaps  xmm2,xmm7
    145         movaps  xmm3,xmm4
    146         addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
    147         addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
    148         subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
    149         subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
    150 
    151         ; -- Even part
        ; In: xmm5=tmp0 xmm0=tmp1 xmm4=tmp2 xmm7=tmp3 (xmm2=tmp4, xmm3=tmp5 stay live)
    152 
    153         movaps  xmm1,xmm5
    154         movaps  xmm6,xmm0
    155         subps   xmm5,xmm7               ; xmm5=tmp13
    156         subps   xmm0,xmm4               ; xmm0=tmp12
    157         addps   xmm1,xmm7               ; xmm1=tmp10
    158         addps   xmm6,xmm4               ; xmm6=tmp11
    159 
    160         addps   xmm0,xmm5
    161         mulps   xmm0,[rel PD_0_707] ; xmm0=z1
    162 
    163         movaps  xmm7,xmm1
    164         movaps  xmm4,xmm5
    165         subps   xmm1,xmm6               ; xmm1=data4
    166         subps   xmm5,xmm0               ; xmm5=data6
    167         addps   xmm7,xmm6               ; xmm7=data0
    168         addps   xmm4,xmm0               ; xmm4=data2
    169 
        ; Output k of these 4 rows is stored at XMMBLOCK(k%4, k/4), i.e. the
        ; strip is left transposed in memory; pass 2 reads it in that layout.
    170         movaps  XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
    171         movaps  XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
    172         movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
    173         movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
    174 
    175         ; -- Odd part
    176 
    177         movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
    178         movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
    179 
    180         addps   xmm2,xmm3               ; xmm2=tmp10
    181         addps   xmm3,xmm6               ; xmm3=tmp11
    182         addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
    183 
    184         mulps   xmm3,[rel PD_0_707] ; xmm3=z3
    185 
    186         movaps  xmm1,xmm2               ; xmm1=tmp10
    187         subps   xmm2,xmm6
    188         mulps   xmm2,[rel PD_0_382] ; xmm2=z5
    189         mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
    190         mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
    191         addps   xmm1,xmm2               ; xmm1=z2
    192         addps   xmm6,xmm2               ; xmm6=z4
    193 
    194         movaps  xmm5,xmm0
    195         subps   xmm0,xmm3               ; xmm0=z13
    196         addps   xmm5,xmm3               ; xmm5=z11
    197 
    198         movaps  xmm7,xmm0
    199         movaps  xmm4,xmm5
    200         subps   xmm0,xmm1               ; xmm0=data3
    201         subps   xmm5,xmm6               ; xmm5=data7
    202         addps   xmm7,xmm1               ; xmm7=data5
    203         addps   xmm4,xmm6               ; xmm4=data1
    204 
    205         movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
    206         movaps  XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
    207         movaps  XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
    208         movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
    209 
    210         add     rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT        ; advance to the next 4 rows
    211         dec     rcx
    212         jnz     near .rowloop
    213 
    214         ; ---- Pass 2: process columns.
    215 
    216         mov     rdx, r10        ; (FAST_FLOAT *)
    217         mov     rcx, DCTSIZE/4
    218 .columnloop:
        ; Invariant: rdx points at the first of the 4 columns handled by this
        ; iteration; rcx = iterations remaining (> 0).
        ; The vectors loaded here are in the transposed layout left by pass 1
        ; (see the lane comments below); they are transposed back so that each
        ; register holds one row (data0..data7) of the 4 columns.
    219 
    220         movaps  xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
    221         movaps  xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
    222         movaps  xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
    223         movaps  xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
    224 
    225         ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
    226         ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
    227 
    228         movaps   xmm4,xmm0              ; transpose coefficients(phase 1)
    229         unpcklps xmm0,xmm1              ; xmm0=(02 03 12 13)
    230         unpckhps xmm4,xmm1              ; xmm4=(22 23 32 33)
    231         movaps   xmm5,xmm2              ; transpose coefficients(phase 1)
    232         unpcklps xmm2,xmm3              ; xmm2=(42 43 52 53)
    233         unpckhps xmm5,xmm3              ; xmm5=(62 63 72 73)
    234 
    235         movaps  xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    236         movaps  xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    237         movaps  xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
    238         movaps  xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
    239 
    240         ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
    241         ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
    242 
        ; Same register-pressure spill as in pass 1.
    243         movaps  XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
    244         movaps  XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
    245 
    246         movaps   xmm4,xmm6              ; transpose coefficients(phase 1)
    247         unpcklps xmm6,xmm7              ; xmm6=(00 01 10 11)
    248         unpckhps xmm4,xmm7              ; xmm4=(20 21 30 31)
    249         movaps   xmm2,xmm1              ; transpose coefficients(phase 1)
    250         unpcklps xmm1,xmm3              ; xmm1=(40 41 50 51)
    251         unpckhps xmm2,xmm3              ; xmm2=(60 61 70 71)
    252 
    253         movaps    xmm7,xmm6             ; transpose coefficients(phase 2)
    254         unpcklps2 xmm6,xmm0             ; xmm6=(00 01 02 03)=data0
    255         unpckhps2 xmm7,xmm0             ; xmm7=(10 11 12 13)=data1
    256         movaps    xmm3,xmm2             ; transpose coefficients(phase 2)
    257         unpcklps2 xmm2,xmm5             ; xmm2=(60 61 62 63)=data6
    258         unpckhps2 xmm3,xmm5             ; xmm3=(70 71 72 73)=data7
    259 
    260         movaps  xmm0,xmm7
    261         movaps  xmm5,xmm6
    262         subps   xmm7,xmm2               ; xmm7=data1-data6=tmp6
    263         subps   xmm6,xmm3               ; xmm6=data0-data7=tmp7
    264         addps   xmm0,xmm2               ; xmm0=data1+data6=tmp1
    265         addps   xmm5,xmm3               ; xmm5=data0+data7=tmp0
    266 
        ; Swap: reload the parked vectors, park tmp6/tmp7 for the odd part.
    267         movaps  xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
    268         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
    269         movaps  XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
    270         movaps  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
    271 
    272         movaps    xmm7,xmm4             ; transpose coefficients(phase 2)
    273         unpcklps2 xmm4,xmm2             ; xmm4=(20 21 22 23)=data2
    274         unpckhps2 xmm7,xmm2             ; xmm7=(30 31 32 33)=data3
    275         movaps    xmm6,xmm1             ; transpose coefficients(phase 2)
    276         unpcklps2 xmm1,xmm3             ; xmm1=(40 41 42 43)=data4
    277         unpckhps2 xmm6,xmm3             ; xmm6=(50 51 52 53)=data5
    278 
    279         movaps  xmm2,xmm7
    280         movaps  xmm3,xmm4
    281         addps   xmm7,xmm1               ; xmm7=data3+data4=tmp3
    282         addps   xmm4,xmm6               ; xmm4=data2+data5=tmp2
    283         subps   xmm2,xmm1               ; xmm2=data3-data4=tmp4
    284         subps   xmm3,xmm6               ; xmm3=data2-data5=tmp5
    285 
    286         ; -- Even part
        ; In: xmm5=tmp0 xmm0=tmp1 xmm4=tmp2 xmm7=tmp3 (xmm2=tmp4, xmm3=tmp5 stay live)
    287 
    288         movaps  xmm1,xmm5
    289         movaps  xmm6,xmm0
    290         subps   xmm5,xmm7               ; xmm5=tmp13
    291         subps   xmm0,xmm4               ; xmm0=tmp12
    292         addps   xmm1,xmm7               ; xmm1=tmp10
    293         addps   xmm6,xmm4               ; xmm6=tmp11
    294 
    295         addps   xmm0,xmm5
    296         mulps   xmm0,[rel PD_0_707] ; xmm0=z1
    297 
    298         movaps  xmm7,xmm1
    299         movaps  xmm4,xmm5
    300         subps   xmm1,xmm6               ; xmm1=data4
    301         subps   xmm5,xmm0               ; xmm5=data6
    302         addps   xmm7,xmm6               ; xmm7=data0
    303         addps   xmm4,xmm0               ; xmm4=data2
    304 
        ; Unlike pass 1, output k goes straight to row k of this column strip.
    305         movaps  XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
    306         movaps  XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
    307         movaps  XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
    308         movaps  XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
    309 
    310         ; -- Odd part
    311 
    312         movaps  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
    313         movaps  xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
    314 
    315         addps   xmm2,xmm3               ; xmm2=tmp10
    316         addps   xmm3,xmm6               ; xmm3=tmp11
    317         addps   xmm6,xmm0               ; xmm6=tmp12, xmm0=tmp7
    318 
    319         mulps   xmm3,[rel PD_0_707] ; xmm3=z3
    320 
    321         movaps  xmm1,xmm2               ; xmm1=tmp10
    322         subps   xmm2,xmm6
    323         mulps   xmm2,[rel PD_0_382] ; xmm2=z5
    324         mulps   xmm1,[rel PD_0_541] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
    325         mulps   xmm6,[rel PD_1_306] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
    326         addps   xmm1,xmm2               ; xmm1=z2
    327         addps   xmm6,xmm2               ; xmm6=z4
    328 
    329         movaps  xmm5,xmm0
    330         subps   xmm0,xmm3               ; xmm0=z13
    331         addps   xmm5,xmm3               ; xmm5=z11
    332 
    333         movaps  xmm7,xmm0
    334         movaps  xmm4,xmm5
    335         subps   xmm0,xmm1               ; xmm0=data3
    336         subps   xmm5,xmm6               ; xmm5=data7
    337         addps   xmm7,xmm1               ; xmm7=data5
    338         addps   xmm4,xmm6               ; xmm4=data1
    339 
    340         movaps  XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
    341         movaps  XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
    342         movaps  XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
    343         movaps  XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
    344 
    345         add     rdx, byte 4*SIZEOF_FAST_FLOAT   ; advance to the next 4 columns
    346         dec     rcx
    347         jnz     near .columnloop
    348 
        ; Epilogue: undo the prologue in reverse order.
    349         uncollect_args
    350         mov     rsp,rbp         ; rsp <- aligned frame base (drops wk[])
    351         pop     rsp             ; rsp <- unaligned rsp stashed by the prologue (points at saved rbp)
    352         pop     rbp
    353         ret
    354 
    355 ; For some reason, the OS X linker does not honor the request to align the
    356 ; segment unless we do this.
    357         align   16
    358