Home | History | Annotate | Download | only in i386
      1 ;
      2 ; jquant.asm - sample data conversion and quantization (SSE & MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 %include "jdct.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23     SECTION     SEG_TEXT
     24     BITS        32
     25 ;
     26 ; Load data into workspace, applying unsigned->signed conversion
        ;
        ; Per loop pass, one MMWORD of samples is read through each of two
        ; consecutive row pointers of sample_data (at column offset start_col).
        ; The bytes are re-centered by subtracting 0x80, sign-extended to 32-bit
        ; integers, converted to single-precision floats and written to workspace
        ; as four XMMWORDs.  The loop runs DCTSIZE/2 times, two rows per pass.
        ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  ebx, esi and edi are saved and restored; eax, ecx and edx are
        ; overwritten, as are mm0-mm7 and xmm0-xmm7.  The MMX state is emptied
        ; with emms before returning.  The workspace stores use movaps, so the
        ; addressed workspace locations must be 16-byte aligned.
     27 ;
     28 ; GLOBAL(void)
     29 ; jsimd_convsamp_float_sse(JSAMPARRAY sample_data, JDIMENSION start_col,
     30 ;                          FAST_FLOAT *workspace);
     31 ;
     32 
     33 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
     34 %define start_col    ebp + 12           ; JDIMENSION start_col
     35 %define workspace    ebp + 16           ; FAST_FLOAT *workspace
     36 
     37     align       32
     38     GLOBAL_FUNCTION(jsimd_convsamp_float_sse)
     39 
     40 EXTN(jsimd_convsamp_float_sse):
     41     push        ebp
     42     mov         ebp, esp
     43     push        ebx
     44 ;   push        ecx                     ; need not be preserved
     45 ;   push        edx                     ; need not be preserved
     46     push        esi
     47     push        edi
     48 
        ; Build the centering constant without a memory load: pcmpeqw sets every
        ; word to 0xFFFF, psllw 7 turns each word into 0xFF80 (-128), and
        ; packsswb narrows each word to the byte 0x80.
     49     pcmpeqw     mm7, mm7
     50     psllw       mm7, 7
     51     packsswb    mm7, mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
     52 
     53     mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     54     mov         eax, JDIMENSION [start_col]
     55     mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
     56     mov         ecx, DCTSIZE/2                 ; loop count; two rows per pass
     57     alignx      16, 7
     58 .convloop:
     59     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     60     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     61 
     62     movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     63     movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
     64 
        ; Subtracting 0x80 from every byte maps the unsigned samples onto signed
        ; bytes.
     65     psubb       mm0, mm7                ; mm0=(01234567)
     66     psubb       mm1, mm7                ; mm1=(89ABCDEF)
     67 
        ; In the unpack steps below the destination registers still hold stale
        ; data ('*' in the lane diagrams).  Each unpack places the sample in the
        ; upper part of the wider lane, so the later psrad both discards the
        ; stale lower bits and sign-extends the sample.
     68     punpcklbw   mm2, mm0                ; mm2=(*0*1*2*3)
     69     punpckhbw   mm0, mm0                ; mm0=(*4*5*6*7)
     70     punpcklbw   mm3, mm1                ; mm3=(*8*9*A*B)
     71     punpckhbw   mm1, mm1                ; mm1=(*C*D*E*F)
     72 
     73     punpcklwd   mm4, mm2                ; mm4=(***0***1)
     74     punpckhwd   mm2, mm2                ; mm2=(***2***3)
     75     punpcklwd   mm5, mm0                ; mm5=(***4***5)
     76     punpckhwd   mm0, mm0                ; mm0=(***6***7)
     77 
        ; Arithmetic right shift by (DWORD_BIT-BYTE_BIT) leaves the sign-extended
        ; sample in each dword (DWORD_BIT/BYTE_BIT are defined outside this file,
        ; presumably 32 and 8 -- confirm).  cvtpi2ps then converts the two
        ; dwords to floats in the low half of the xmm register; its upper half
        ; is left unchanged ('**').
     78     psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(01)
     79     psrad       mm2, (DWORD_BIT-BYTE_BIT)  ; mm2=(23)
     80     cvtpi2ps    xmm0, mm4                  ; xmm0=(01**)
     81     cvtpi2ps    xmm1, mm2                  ; xmm1=(23**)
     82     psrad       mm5, (DWORD_BIT-BYTE_BIT)  ; mm5=(45)
     83     psrad       mm0, (DWORD_BIT-BYTE_BIT)  ; mm0=(67)
     84     cvtpi2ps    xmm2, mm5                  ; xmm2=(45**)
     85     cvtpi2ps    xmm3, mm0                  ; xmm3=(67**)
     86 
        ; Same widening for the second row; mm4 is reused as a scratch
        ; destination now that its previous contents have been converted.
     87     punpcklwd   mm6, mm3                ; mm6=(***8***9)
     88     punpckhwd   mm3, mm3                ; mm3=(***A***B)
     89     punpcklwd   mm4, mm1                ; mm4=(***C***D)
     90     punpckhwd   mm1, mm1                ; mm1=(***E***F)
     91 
     92     psrad       mm6, (DWORD_BIT-BYTE_BIT)  ; mm6=(89)
     93     psrad       mm3, (DWORD_BIT-BYTE_BIT)  ; mm3=(AB)
     94     cvtpi2ps    xmm4, mm6                  ; xmm4=(89**)
     95     cvtpi2ps    xmm5, mm3                  ; xmm5=(AB**)
     96     psrad       mm4, (DWORD_BIT-BYTE_BIT)  ; mm4=(CD)
     97     psrad       mm1, (DWORD_BIT-BYTE_BIT)  ; mm1=(EF)
     98     cvtpi2ps    xmm6, mm4                  ; xmm6=(CD**)
     99     cvtpi2ps    xmm7, mm1                  ; xmm7=(EF**)
    100 
        ; movlhps copies the low two floats of the source into the high half of
        ; the destination, joining two float pairs into one vector of four.
    101     movlhps     xmm0, xmm1              ; xmm0=(0123)
    102     movlhps     xmm2, xmm3              ; xmm2=(4567)
    103     movlhps     xmm4, xmm5              ; xmm4=(89AB)
    104     movlhps     xmm6, xmm7              ; xmm6=(CDEF)
    105 
        ; XMMBLOCK is a macro defined outside this file; presumably its arguments
        ; are (row, xmm column, base, element size) -- confirm in the include.
        ; movaps faults on a memory operand that is not 16-byte aligned.
    106     movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    107     movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    108     movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
    109     movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    110 
    111     add         esi, byte 2*SIZEOF_JSAMPROW            ; skip two row pointers
    112     add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; skip two output rows
    113     dec         ecx
    114     jnz         near .convloop
    115 
    116     emms                                ; empty MMX state
    117 
    118     pop         edi
    119     pop         esi
    120 ;   pop         edx                     ; need not be preserved
    121 ;   pop         ecx                     ; need not be preserved
    122     pop         ebx
    123     pop         ebp
    124     ret
    125 
    126 ; --------------------------------------------------------------------------
    127 ;
    128 ; Quantize/descale the coefficients, and store into coef_block
        ;
        ; Per loop pass, four XMMWORDs (16 floats) are loaded from workspace and
        ; multiplied element-wise by the matching four XMMWORDs of divisors.  The
        ; products are converted to 32-bit integers with cvtps2pi (which rounds
        ; according to the current MXCSR rounding mode), packed to 16-bit words
        ; with signed saturation and stored to coef_block as four MMWORDs.  The
        ; loop runs DCTSIZE2/16 times.
        ;
        ; NOTE(review): because the values are multiplied rather than divided,
        ; divisors presumably holds reciprocals -- confirm against the code that
        ; fills the table.
        ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  esi and edi are saved and restored; eax and edx are
        ; overwritten, as are mm0-mm7 and xmm0-xmm7.  The MMX state is emptied
        ; with emms before returning.  The loads and mulps memory operands need
        ; 16-byte alignment.
    129 ;
    130 ; GLOBAL(void)
    131 ; jsimd_quantize_float_sse(JCOEFPTR coef_block, FAST_FLOAT *divisors,
    132 ;                          FAST_FLOAT *workspace);
    133 ;
    134 
    135 %define coef_block  ebp + 8             ; JCOEFPTR coef_block
    136 %define divisors    ebp + 12            ; FAST_FLOAT *divisors
    137 %define workspace   ebp + 16            ; FAST_FLOAT *workspace
    138 
    139     align       32
    140     GLOBAL_FUNCTION(jsimd_quantize_float_sse)
    141 
    142 EXTN(jsimd_quantize_float_sse):
    143     push        ebp
    144     mov         ebp, esp
    145 ;   push        ebx                     ; unused
    146 ;   push        ecx                     ; unused
    147 ;   push        edx                     ; need not be preserved
    148     push        esi
    149     push        edi
    150 
    151     mov         esi, POINTER [workspace]
    152     mov         edx, POINTER [divisors]
    153     mov         edi, JCOEFPTR [coef_block]
    154     mov         eax, DCTSIZE2/16        ; loop count; 16 coefficients per pass
    155     alignx      16, 7
    156 .quantloop:
        ; Load four vectors of workspace values and scale each by the matching
        ; divisors vector.
    157     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    158     movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    159     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    160     mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    161     movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    162     movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    163     mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    164     mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    165 
        ; cvtps2pi converts only the low two floats of its source, so the high
        ; two floats of each vector are first copied to the low half of a scratch
        ; register with movhlps.
    166     movhlps     xmm4, xmm0
    167     movhlps     xmm5, xmm1
    168 
    169     cvtps2pi    mm0, xmm0
    170     cvtps2pi    mm1, xmm1
    171     cvtps2pi    mm4, xmm4
    172     cvtps2pi    mm5, xmm5
    173 
    174     movhlps     xmm6, xmm2
    175     movhlps     xmm7, xmm3
    176 
    177     cvtps2pi    mm2, xmm2
    178     cvtps2pi    mm3, xmm3
    179     cvtps2pi    mm6, xmm6
    180     cvtps2pi    mm7, xmm7
    181 
        ; packssdw narrows dwords to words with signed saturation; the
        ; destination supplies the low two words and the source the high two, so
        ; each result holds four consecutive coefficients in order.
    182     packssdw    mm0, mm4
    183     packssdw    mm1, mm5
    184     packssdw    mm2, mm6
    185     packssdw    mm3, mm7
    186 
    187     movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    188     movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    189     movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
    190     movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
    191 
        ; Advance all three pointers by the 16 elements handled in this pass.
    192     add         esi, byte 16*SIZEOF_FAST_FLOAT
    193     add         edx, byte 16*SIZEOF_FAST_FLOAT
    194     add         edi, byte 16*SIZEOF_JCOEF
    195     dec         eax
    196     jnz         short .quantloop
    197 
    198     emms                                ; empty MMX state
    199 
    200     pop         edi
    201     pop         esi
    202 ;   pop         edx                     ; need not be preserved
    203 ;   pop         ecx                     ; unused
    204 ;   pop         ebx                     ; unused
    205     pop         ebp
    206     ret
    207 
    208 ; For some reason, the OS X linker does not honor the request to align the
    209 ; segment unless we do this.
    210     align       32                      ; pad the end of the section to a 32-byte boundary
    211