Home | History | Annotate | Download | only in i386
      1 ;
      2 ; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 %include "jdct.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23     SECTION     SEG_TEXT                ; SEG_TEXT is not defined in this file (presumably supplied by jsimdext.inc)
     24     BITS        32                      ; assemble everything below as 32-bit code
     25 ;
     26 ; Load data into workspace, applying unsigned->signed conversion
     27 ;
     28 ; GLOBAL(void)
     29 ; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
     30 ;                           FAST_FLOAT *workspace);
     31 ;
        ; Operation:
        ;   Each pass of .convloop fetches two consecutive row pointers from
        ;   sample_data, loads 8 bytes from each row at offset
        ;   start_col*SIZEOF_JSAMPLE, subtracts 0x80 from every byte, sign-extends
        ;   the results to 32-bit integers, converts them to single-precision
        ;   floats and stores the resulting 16 floats through workspace.
        ;   The loop body runs DCTSIZE/2 times.
        ;
        ; In:      [ebp+8]  = sample_data
        ;          [ebp+12] = start_col
        ;          [ebp+16] = workspace
        ;          (all three are read from the caller's stack frame)
        ; Returns: void per the prototype above; output goes to workspace only.
        ; Saved:   ebp, ebx, esi, edi (pushed on entry, popped on exit)
        ; Used without saving: eax, ecx, edx, xmm0-xmm3, xmm7, EFLAGS
        ;
        ; NOTE(review): the stores use movaps, which faults when its memory
        ; operand is not 16-byte aligned, so workspace is expected to be 16-byte
        ; aligned -- confirm against the caller's allocation.
        ;
     32 
     33 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data
     34 %define start_col    ebp + 12           ; JDIMENSION start_col
     35 %define workspace    ebp + 16           ; FAST_FLOAT *workspace
     36 
     37     align       32
     38     GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)
     39 
     40 EXTN(jsimd_convsamp_float_sse2):
     41     push        ebp
     42     mov         ebp, esp
     43     push        ebx
     44 ;   push        ecx                     ; need not be preserved
     45 ;   push        edx                     ; need not be preserved
     46     push        esi
     47     push        edi
     48 
            ; Build the bias constant without a memory load: all-ones words shifted
            ; left by 7 give 0xFF80 (-128) per word, and the signed-saturating pack
            ; to bytes leaves 0x80 in every byte.
     49     pcmpeqw     xmm7, xmm7
     50     psllw       xmm7, 7
     51     packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
     52 
     53     mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     54     mov         eax, JDIMENSION [start_col]
     55     mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
     56     mov         ecx, DCTSIZE/2                 ; ecx = row pairs left to convert
            ; alignx is defined outside this file (presumably pads so that the loop
            ; head lands on a 16-byte boundary -- confirm in jsimdext.inc).
     57     alignx      16, 7
     58 .convloop:
     59     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     60     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     61 
            ; movq loads 8 bytes from each of the two rows into the low half of
            ; the destination register.
     62     movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     63     movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     64 
            ; Byte-wise x - 0x80 (mod 256): read as a signed byte, the result is
            ; x - 128.
     65     psubb       xmm0, xmm7              ; xmm0=(01234567)
     66     psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)
     67 
            ; Widen in two unpack steps.  '*' in the lane diagrams marks bytes whose
            ; value does not matter.  First, copy each byte into both halves of a
            ; word.
     68     punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
     69     punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)
     70 
            ; Then place each word in the upper half of a dword.  xmm2 and xmm3 are
            ; used as destinations without being initialised: their old contents
            ; end up in the lower halves, which the shift below discards.
     71     punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
     72     punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
     73     punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
     74     punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)
     75 
            ; The arithmetic right shift keeps only the top byte of every dword and
            ; sign-extends it; cvtdq2ps then converts the signed dwords to floats.
            ; DWORD_BIT and BYTE_BIT come from the include files (presumably 32 and
            ; 8, i.e. a shift by 24 -- confirm).
     76     psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
     77     psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
     78     cvtdq2ps    xmm2, xmm2                  ; xmm2=(0123)
     79     cvtdq2ps    xmm0, xmm0                  ; xmm0=(4567)
     80     psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
     81     psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
     82     cvtdq2ps    xmm3, xmm3                  ; xmm3=(89AB)
     83     cvtdq2ps    xmm1, xmm1                  ; xmm1=(CDEF)
     84 
            ; Four aligned 16-byte stores = 16 floats.  XMMBLOCK is defined outside
            ; this file; from its use here the first two arguments presumably select
            ; the row and the 4-float group within that row -- confirm.
     85     movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
     86     movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
     87     movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
     88     movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
     89 
     90     add         esi, byte 2*SIZEOF_JSAMPROW             ; step past the two row pointers just used
     91     add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; step past 2*DCTSIZE floats of workspace
     92     dec         ecx
     93     jnz         short .convloop
     94 
            ; Restore in reverse push order.
     95     pop         edi
     96     pop         esi
     97 ;   pop         edx                     ; need not be preserved
     98 ;   pop         ecx                     ; need not be preserved
     99     pop         ebx
    100     pop         ebp
    101     ret
    102 
    103 ; --------------------------------------------------------------------------
    104 ;
    105 ; Quantize/descale the coefficients, and store into coef_block
    106 ;
    107 ; GLOBAL(void)
    108 ; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
    109 ;                           FAST_FLOAT *workspace);
    110 ;
        ; Operation:
        ;   Each pass of .quantloop multiplies 16 floats from workspace by the
        ;   16 floats at the same position in divisors, converts the products to
        ;   32-bit integers, packs them to 16-bit words with signed saturation and
        ;   stores the 16 words through coef_block.  The loop body runs
        ;   DCTSIZE2/16 times.
        ;
        ;   cvtps2dq rounds according to the rounding mode currently set in MXCSR;
        ;   this routine does not change that mode.
        ;
        ; NOTE(review): the table is multiplied in (mulps), not divided by, so
        ; despite its name divisors presumably holds reciprocal values -- verify
        ; against the code that fills it.
        ;
        ; In:      [ebp+8]  = coef_block
        ;          [ebp+12] = divisors
        ;          [ebp+16] = workspace
        ;          (all three are read from the caller's stack frame)
        ; Returns: void per the prototype above; output goes to coef_block only.
        ; Saved:   ebp, esi, edi (pushed on entry, popped on exit)
        ; Used without saving: eax, edx, xmm0-xmm3, EFLAGS
        ;
        ; NOTE(review): movaps, movdqa and the memory operands of mulps all fault
        ; on addresses that are not 16-byte aligned, so workspace, divisors and
        ; coef_block are all expected to be 16-byte aligned -- confirm with the
        ; caller.
        ;
    111 
    112 %define coef_block  ebp + 8             ; JCOEFPTR coef_block
    113 %define divisors    ebp + 12            ; FAST_FLOAT *divisors
    114 %define workspace   ebp + 16            ; FAST_FLOAT *workspace
    115 
    116     align       32
    117     GLOBAL_FUNCTION(jsimd_quantize_float_sse2)
    118 
    119 EXTN(jsimd_quantize_float_sse2):
    120     push        ebp
    121     mov         ebp, esp
    122 ;   push        ebx                     ; unused
    123 ;   push        ecx                     ; unused
    124 ;   push        edx                     ; need not be preserved
    125     push        esi
    126     push        edi
    127 
    128     mov         esi, POINTER [workspace]        ; esi = input floats
    129     mov         edx, POINTER [divisors]         ; edx = per-coefficient multipliers
    130     mov         edi, JCOEFPTR [coef_block]      ; edi = output coefficients
    131     mov         eax, DCTSIZE2/16                ; eax = 16-coefficient groups left
            ; alignx is defined outside this file (presumably pads so that the loop
            ; head lands on a 16-byte boundary -- confirm in jsimdext.inc).
    132     alignx      16, 7
    133 .quantloop:
            ; Load four groups of 4 floats and multiply each by the matching group
            ; from the divisors table.
    134     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    135     movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    136     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    137     mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    138     movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    139     movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    140     mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    141     mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    142 
            ; float -> signed 32-bit integer, rounded per the current MXCSR mode.
    143     cvtps2dq    xmm0, xmm0
    144     cvtps2dq    xmm1, xmm1
    145     cvtps2dq    xmm2, xmm2
    146     cvtps2dq    xmm3, xmm3
    147 
            ; 2 x 4 dwords -> 8 words with signed saturation; the first operand's
            ; values occupy the low half of the result, so element order is kept.
    148     packssdw    xmm0, xmm1
    149     packssdw    xmm2, xmm3
    150 
            ; Two aligned 16-byte stores carry the 16 packed words.  Together with
            ; the 16*SIZEOF_JCOEF advance below this implies a 2-byte JCOEF --
            ; TODO confirm in the include files.
    151     movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
    152     movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2
    153 
    154     add         esi, byte 16*SIZEOF_FAST_FLOAT  ; advance input by 16 floats
    155     add         edx, byte 16*SIZEOF_FAST_FLOAT  ; advance multipliers by 16 floats
    156     add         edi, byte 16*SIZEOF_JCOEF       ; advance output by 16 coefficients
    157     dec         eax
    158     jnz         short .quantloop
    159 
            ; Restore in reverse push order.
    160     pop         edi
    161     pop         esi
    162 ;   pop         edx                     ; need not be preserved
    163 ;   pop         ecx                     ; unused
    164 ;   pop         ebx                     ; unused
    165     pop         ebp
    166     ret
    167 
    168 ; For some reason, the OS X linker does not honor the request to align the
    169 ; segment unless we do this.
    170     align       32                      ; pad the end of this section to a 32-byte boundary
    171