Home | History | Annotate | Download | only in x86_64
      1 ;
      2 ; jfdctflt.asm - floating-point FDCT (64-bit SSE)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; This file contains a floating-point implementation of the forward DCT
     18 ; (Discrete Cosine Transform). The following code is based directly on
     19 ; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
     20 ;
     21 ; [TAB8]
     22 
     23 %include "jsimdext.inc"
     24 %include "jdct.inc"
     25 
     26 ; --------------------------------------------------------------------------
     27 
     28 %macro  unpcklps2 2  ; low halves: %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
     29     shufps      %1, %2, 01000100b       ; lane selectors, high to low: %2[1] %2[0] %1[1] %1[0]
     30 %endmacro
     31 
     32 %macro  unpckhps2 2  ; high halves: %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
     33     shufps      %1, %2, 11101110b       ; lane selectors, high to low: %2[3] %2[2] %1[3] %1[2]
     34 %endmacro
     35 
     36 ; --------------------------------------------------------------------------
     37     SECTION     SEG_CONST
     38 ; Multiplier constants used as mulps memory operands by the FDCT code.
     39     alignz      32
     40     GLOBAL_DATA(jconst_fdct_float_sse)
     41 
     42 EXTN(jconst_fdct_float_sse):
     43 ; Each value is repeated in all four single-precision lanes (times 4 dd).
     44 PD_0_382 times 4 dd 0.382683432365089771728460  ; = cos(3*pi/8)
     45 PD_0_707 times 4 dd 0.707106781186547524400844  ; = cos(pi/4)
     46 PD_0_541 times 4 dd 0.541196100146196984399723  ; = cos(pi/8) - cos(3*pi/8)
     47 PD_1_306 times 4 dd 1.306562964876376527856643  ; = cos(pi/8) + cos(3*pi/8)
     48 
     49     alignz      32
     50 
     51 ; --------------------------------------------------------------------------
     52     SECTION     SEG_TEXT
     53     BITS        64
     54 ;
     55 ; Perform the forward DCT on one block of samples, in place.
     56 ;
     57 ; GLOBAL(void)
     58 ; jsimd_fdct_float_sse(FAST_FLOAT *data)
     59 ;
     60 ; data is accessed with movaps (aligned moves), so it is expected to be 16-byte aligned.
     61 ; r10 = FAST_FLOAT *data (presumably loaded by collect_args; verify in jsimdext.inc)
     62 ; The code below overwrites rax, rcx, rdx and xmm0-xmm7.
     63 %define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
     64 %define WK_NUM  2
     65 ; wk(0)..wk(WK_NUM-1) are xmmword scratch slots just below rbp, wk(0) lowest.
     66     align       32
     67     GLOBAL_FUNCTION(jsimd_fdct_float_sse)
     68 ; NOTE(review): 'sub rsp, byte 4' + the 8-byte store to [rsp] is safe only if rsp is 8-byte aligned on entry.
     69 EXTN(jsimd_fdct_float_sse):
     70     push        rbp                          ; save caller's rbp
     71     mov         rax, rsp                     ; rax = rsp after the push (-> saved rbp)
     72     sub         rsp, byte 4                  ; step below the saved rbp before aligning
     73     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
     74     mov         [rsp], rax                   ; keep the unaligned rsp in the aligned slot
     75     mov         rbp, rsp                     ; rbp = aligned frame base ([rbp] = saved rsp)
     76     lea         rsp, [wk(0)]                 ; rsp = &wk[0]: reserves the wk array
     77     collect_args 1                           ; macro defined elsewhere; see r10 note above
     78 
     79     ; ---- Pass 1: process rows.
     80 
     81     mov         rdx, r10                ; rdx = data (FAST_FLOAT *)
     82     mov         rcx, DCTSIZE/4          ; rcx = loop count (4 rows per iteration)
     83 .rowloop:
     84     ; Load eight xmmwords and transpose them as two 4x4 groups (data0-3, data4-7).
     85     movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
     86     movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
     87     movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
     88     movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
     89 
     90     ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
     91     ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
     92 
     93     movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
     94     unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
     95     unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
     96     movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
     97     unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
     98     unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
     99 
    100     movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    101     movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    102     movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
    103     movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
    104 
    105     ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
    106     ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
    107     ; All eight xmm registers are live here, so two values are spilled to wk[].
    108     movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
    109     movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
    110 
    111     movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
    112     unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
    113     unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
    114     movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
    115     unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
    116     unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
    117 
    118     movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
    119     unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
    120     unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
    121     movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
    122     unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
    123     unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
    124 
    125     movaps      xmm0, xmm7              ; xmm0=data1 (copy)
    126     movaps      xmm5, xmm6              ; xmm5=data0 (copy)
    127     subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
    128     subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
    129     addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
    130     addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
    131 
    132     movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
    133     movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
    134     movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
    135     movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
    136 
    137     movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
    138     unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
    139     unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
    140     movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
    141     unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
    142     unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
    143 
    144     movaps      xmm2, xmm7              ; xmm2=data3 (copy)
    145     movaps      xmm3, xmm4              ; xmm3=data2 (copy)
    146     addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
    147     addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
    148     subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
    149     subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
    150 
    151     ; -- Even part
    152 
    153     movaps      xmm1, xmm5              ; xmm1=tmp0
    154     movaps      xmm6, xmm0              ; xmm6=tmp1
    155     subps       xmm5, xmm7              ; xmm5=tmp13
    156     subps       xmm0, xmm4              ; xmm0=tmp12
    157     addps       xmm1, xmm7              ; xmm1=tmp10
    158     addps       xmm6, xmm4              ; xmm6=tmp11
    159 
    160     addps       xmm0, xmm5              ; xmm0=tmp12+tmp13
    161     mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
    162 
    163     movaps      xmm7, xmm1              ; xmm7=tmp10
    164     movaps      xmm4, xmm5              ; xmm4=tmp13
    165     subps       xmm1, xmm6              ; xmm1=data4
    166     subps       xmm5, xmm0              ; xmm5=data6
    167     addps       xmm7, xmm6              ; xmm7=data0
    168     addps       xmm4, xmm0              ; xmm4=data2
    169 
    170     movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1  ; store data4
    171     movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5  ; store data6
    172     movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7  ; store data0
    173     movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4  ; store data2
    174 
    175     ; -- Odd part
    176 
    177     movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
    178     movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
    179 
    180     addps       xmm2, xmm3              ; xmm2=tmp10
    181     addps       xmm3, xmm6              ; xmm3=tmp11
    182     addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
    183 
    184     mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
    185 
    186     movaps      xmm1, xmm2              ; xmm1=tmp10
    187     subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
    188     mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
    189     mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
    190     mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
    191     addps       xmm1, xmm2              ; xmm1=z2
    192     addps       xmm6, xmm2              ; xmm6=z4
    193 
    194     movaps      xmm5, xmm0              ; xmm5=tmp7
    195     subps       xmm0, xmm3              ; xmm0=z13
    196     addps       xmm5, xmm3              ; xmm5=z11
    197 
    198     movaps      xmm7, xmm0              ; xmm7=z13
    199     movaps      xmm4, xmm5              ; xmm4=z11
    200     subps       xmm0, xmm1              ; xmm0=data3
    201     subps       xmm5, xmm6              ; xmm5=data7
    202     addps       xmm7, xmm1              ; xmm7=data5
    203     addps       xmm4, xmm6              ; xmm4=data1
    204 
    205     movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0  ; store data3
    206     movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5  ; store data7
    207     movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7  ; store data5
    208     movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4  ; store data1
    209 
    210     add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance rdx by 4 rows
    211     dec         rcx
    212     jnz         near .rowloop
    213 
    214     ; ---- Pass 2: process columns.
    215 
    216     mov         rdx, r10                ; rdx = data (FAST_FLOAT *)
    217     mov         rcx, DCTSIZE/4          ; rcx = loop count (4 columns per iteration)
    218 .columnloop:
    219     ; Load eight xmmwords and transpose them as two 4x4 groups (data0-3, data4-7).
    220     movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
    221     movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
    222     movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
    223     movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
    224 
    225     ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
    226     ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
    227 
    228     movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
    229     unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
    230     unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
    231     movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
    232     unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
    233     unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
    234 
    235     movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
    236     movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
    237     movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
    238     movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
    239 
    240     ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
    241     ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
    242     ; All eight xmm registers are live here, so two values are spilled to wk[].
    243     movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
    244     movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
    245 
    246     movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
    247     unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
    248     unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
    249     movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
    250     unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
    251     unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
    252 
    253     movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
    254     unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
    255     unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
    256     movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
    257     unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
    258     unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
    259 
    260     movaps      xmm0, xmm7              ; xmm0=data1 (copy)
    261     movaps      xmm5, xmm6              ; xmm5=data0 (copy)
    262     subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
    263     subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
    264     addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
    265     addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
    266 
    267     movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
    268     movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
    269     movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
    270     movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
    271 
    272     movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
    273     unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
    274     unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
    275     movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
    276     unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
    277     unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
    278 
    279     movaps      xmm2, xmm7              ; xmm2=data3 (copy)
    280     movaps      xmm3, xmm4              ; xmm3=data2 (copy)
    281     addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
    282     addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
    283     subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
    284     subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
    285 
    286     ; -- Even part
    287 
    288     movaps      xmm1, xmm5              ; xmm1=tmp0
    289     movaps      xmm6, xmm0              ; xmm6=tmp1
    290     subps       xmm5, xmm7              ; xmm5=tmp13
    291     subps       xmm0, xmm4              ; xmm0=tmp12
    292     addps       xmm1, xmm7              ; xmm1=tmp10
    293     addps       xmm6, xmm4              ; xmm6=tmp11
    294 
    295     addps       xmm0, xmm5              ; xmm0=tmp12+tmp13
    296     mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
    297 
    298     movaps      xmm7, xmm1              ; xmm7=tmp10
    299     movaps      xmm4, xmm5              ; xmm4=tmp13
    300     subps       xmm1, xmm6              ; xmm1=data4
    301     subps       xmm5, xmm0              ; xmm5=data6
    302     addps       xmm7, xmm6              ; xmm7=data0
    303     addps       xmm4, xmm0              ; xmm4=data2
    304 
    305     movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1  ; store data4
    306     movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5  ; store data6
    307     movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7  ; store data0
    308     movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4  ; store data2
    309 
    310     ; -- Odd part
    311 
    312     movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
    313     movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
    314 
    315     addps       xmm2, xmm3              ; xmm2=tmp10
    316     addps       xmm3, xmm6              ; xmm3=tmp11
    317     addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
    318 
    319     mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
    320 
    321     movaps      xmm1, xmm2              ; xmm1=tmp10
    322     subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
    323     mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
    324     mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
    325     mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
    326     addps       xmm1, xmm2              ; xmm1=z2
    327     addps       xmm6, xmm2              ; xmm6=z4
    328 
    329     movaps      xmm5, xmm0              ; xmm5=tmp7
    330     subps       xmm0, xmm3              ; xmm0=z13
    331     addps       xmm5, xmm3              ; xmm5=z11
    332 
    333     movaps      xmm7, xmm0              ; xmm7=z13
    334     movaps      xmm4, xmm5              ; xmm4=z11
    335     subps       xmm0, xmm1              ; xmm0=data3
    336     subps       xmm5, xmm6              ; xmm5=data7
    337     addps       xmm7, xmm1              ; xmm7=data5
    338     addps       xmm4, xmm6              ; xmm4=data1
    339 
    340     movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0  ; store data3
    341     movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5  ; store data7
    342     movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7  ; store data5
    343     movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4  ; store data1
    344 
    345     add         rdx, byte 4*SIZEOF_FAST_FLOAT  ; advance rdx by 4 columns
    346     dec         rcx
    347     jnz         near .columnloop
    348 
    349     uncollect_args 1                    ; counterpart of collect_args (macro defined elsewhere)
    350     mov         rsp, rbp                ; rsp <- aligned rbp
    351     pop         rsp                     ; rsp <- value saved at [rbp] (-> saved rbp)
    352     pop         rbp                     ; restore caller's rbp
    353     ret
    354 
    355 ; For some reason, the OS X linker does not honor the request to align the
    356 ; segment unless we do this.
    357     align       32
    358