Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jquant.asm - sample data conversion and quantization (SSE & MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler); it
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; [TAB8]
     17 
     18 %include "jsimdext.inc"
     19 %include "jdct.inc"
     20 
     21 ; --------------------------------------------------------------------------
        ; Everything below is emitted into the code section.  SEG_TEXT is not
        ; defined in this file; presumably it comes from jsimdext.inc -- TODO confirm.
     22         SECTION SEG_TEXT
     23         BITS    32              ; 32-bit code; the routines below read their arguments via ebp
     24 ;
     25 ; Load data into workspace, applying unsigned->signed conversion
     26 ;
        ; Each loop pass reads 8 samples from each of two consecutive rows of
        ; sample_data, starting at column start_col, subtracts 0x80 from every
        ; byte, sign-extends the results to 32-bit integers, converts them to
        ; single-precision floats and stores them as 2 x 8 floats in workspace.
        ; The loop runs DCTSIZE/2 times.
        ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  ebx, esi, edi and ebp are saved and restored; eax, ecx, edx,
        ; mm0-mm7 and xmm0-xmm7 are overwritten.  workspace is written with
        ; movaps, so it must be 16-byte aligned.  The MMX state is cleared with
        ; emms before returning.
        ;
     27 ; GLOBAL(void)
     28 ; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
     29 ;                           FAST_FLOAT *workspace);
     30 ;
     31 
     32 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
     33 %define start_col       ebp+12          ; JDIMENSION start_col
     34 %define workspace       ebp+16          ; FAST_FLOAT *workspace
     35 
     36         align   16
     37         global  EXTN(jsimd_convsamp_float_sse)
     38 
     39 EXTN(jsimd_convsamp_float_sse):
     40         push    ebp
     41         mov     ebp,esp
     42         push    ebx
     43 ;       push    ecx             ; need not be preserved
     44 ;       push    edx             ; need not be preserved
     45         push    esi
     46         push    edi
     47 
                ; Build the 0x80 bias in registers instead of loading it from memory:
                ; all-ones words -> 0xFF80 in every word -> signed-saturating pack
                ; to bytes gives 0x80 in every byte.
     48         pcmpeqw  mm7,mm7
     49         psllw    mm7,7
     50         packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
     51 
     52         mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
     53         mov     eax, JDIMENSION [start_col]
     54         mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
     55         mov     ecx, DCTSIZE/2                  ; loop counter: two rows per pass
     56         alignx  16,7                            ; alignx is defined outside this file; presumably aligns the loop entry
     57 .convloop:
     58         mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
     59         mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
     60 
                ; Fetch 8 samples from each of the two rows, starting at start_col.
     61         movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     62         movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
     63 
                ; Wrapping byte subtract of 0x80 turns unsigned samples into signed ones.
     64         psubb   mm0,mm7                         ; mm0=(01234567)
     65         psubb   mm1,mm7                         ; mm1=(89ABCDEF)
     66 
                ; Widen bytes to dwords.  Each unpack places the sample in the UPPER
                ; part of the wider lane; '*' marks the lower part, which holds
                ; whatever the destination register contained before and is thrown
                ; away by the arithmetic shifts further down.
     67         punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
     68         punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
     69         punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
     70         punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
     71 
     72         punpcklwd mm4,mm2                       ; mm4=(***0***1)
     73         punpckhwd mm2,mm2                       ; mm2=(***2***3)
     74         punpcklwd mm5,mm0                       ; mm5=(***4***5)
     75         punpckhwd mm0,mm0                       ; mm0=(***6***7)
     76 
                ; Arithmetic right shift brings each sample down from the top byte of
                ; its dword, sign-extending it and dropping the '*' bytes (assuming
                ; DWORD_BIT-BYTE_BIT is 24; both constants are defined outside this
                ; file).  cvtpi2ps then converts two int32 values into the low two
                ; floats of an xmm register; the upper two ('**') are left untouched
                ; and are filled in by movlhps below.
     77         psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
     78         psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
     79         cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
     80         cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
     81         psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
     82         psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
     83         cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
     84         cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)
     85 
                ; Same widening for the second row.  mm4 is free again here because
                ; its previous value was already converted into xmm0.
     86         punpcklwd mm6,mm3                       ; mm6=(***8***9)
     87         punpckhwd mm3,mm3                       ; mm3=(***A***B)
     88         punpcklwd mm4,mm1                       ; mm4=(***C***D)
     89         punpckhwd mm1,mm1                       ; mm1=(***E***F)
     90 
     91         psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
     92         psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
     93         cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
     94         cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
     95         psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
     96         psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
     97         cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
     98         cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)
     99 
                ; Merge pairs of half-filled registers into full 4-float vectors.
    100         movlhps   xmm0,xmm1                     ; xmm0=(0123)
    101         movlhps   xmm2,xmm3                     ; xmm2=(4567)
    102         movlhps   xmm4,xmm5                     ; xmm4=(89AB)
    103         movlhps   xmm6,xmm7                     ; xmm6=(CDEF)
    104 
                ; Aligned stores: edi must be 16-byte aligned.  XMMBLOCK is defined
                ; outside this file; presumably XMMBLOCK(r,c,base,size) addresses the
                ; c-th group of four floats in row r -- TODO confirm.
    105         movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    106         movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    107         movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
    108         movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    109 
    110         add     esi, byte 2*SIZEOF_JSAMPROW             ; next pair of row pointers
    111         add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; skip the 2*DCTSIZE floats just written
    112         dec     ecx
    113         jnz     near .convloop
    114 
    115         emms            ; empty MMX state
    116 
    117         pop     edi
    118         pop     esi
    119 ;       pop     edx             ; need not be preserved
    120 ;       pop     ecx             ; need not be preserved
    121         pop     ebx
    122         pop     ebp
    123         ret
    124 
    125 
    126 ; --------------------------------------------------------------------------
    127 ;
    128 ; Quantize/descale the coefficients, and store into coef_block
    129 ;
        ; Each loop pass multiplies 16 floats from workspace by the 16
        ; corresponding floats from divisors, converts the products to 32-bit
        ; integers with cvtps2pi, packs them to 16-bit words with signed
        ; saturation (packssdw) and stores the 16 words to coef_block.  The loop
        ; runs DCTSIZE2/16 times.
        ;
        ; The divisors table is applied with mulps, so it presumably holds
        ; reciprocals of the quantization steps -- TODO confirm against the code
        ; that fills the table.
        ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  esi, edi and ebp are saved and restored; eax, edx, mm0-mm7 and
        ; xmm0-xmm7 are overwritten.  workspace and divisors are used as movaps /
        ; mulps memory operands, so both must be 16-byte aligned.  The MMX state
        ; is cleared with emms before returning.
        ;
    130 ; GLOBAL(void)
    131 ; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT *divisors,
    132 ;                           FAST_FLOAT *workspace);
    133 ;
    134 
    135 %define coef_block      ebp+8           ; JCOEFPTR coef_block
    136 %define divisors        ebp+12          ; FAST_FLOAT *divisors
    137 %define workspace       ebp+16          ; FAST_FLOAT *workspace
    138 
    139         align   16
    140         global  EXTN(jsimd_quantize_float_sse)
    141 
    142 EXTN(jsimd_quantize_float_sse):
    143         push    ebp
    144         mov     ebp,esp
    145 ;       push    ebx             ; unused
    146 ;       push    ecx             ; unused
    147 ;       push    edx             ; need not be preserved
    148         push    esi
    149         push    edi
    150 
    151         mov     esi, POINTER [workspace]        ; (FAST_FLOAT *) input values
    152         mov     edx, POINTER [divisors]         ; (FAST_FLOAT *) per-coefficient multipliers
    153         mov     edi, JCOEFPTR [coef_block]      ; output coefficients
    154         mov     eax, DCTSIZE2/16                ; loop counter: 16 coefficients per pass
    155         alignx  16,7                            ; alignx is defined outside this file; presumably aligns the loop entry
    156 .quantloop:
                ; Load 16 floats and scale each by its entry in the divisors table.
    157         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    158         movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    159         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    160         mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    161         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    162         movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    163         mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    164         mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    165 
                ; cvtps2pi converts only the low two floats of its source, so the
                ; high two floats of each vector are first copied down into a spare
                ; xmm register.
    166         movhlps  xmm4,xmm0
    167         movhlps  xmm5,xmm1
    168 
                ; Float -> int32.  This is the non-truncating form (cvtps2pi, not
                ; cvttps2pi), so rounding follows the current MXCSR rounding mode.
    169         cvtps2pi mm0,xmm0
    170         cvtps2pi mm1,xmm1
    171         cvtps2pi mm4,xmm4
    172         cvtps2pi mm5,xmm5
    173 
    174         movhlps  xmm6,xmm2
    175         movhlps  xmm7,xmm3
    176 
    177         cvtps2pi mm2,xmm2
    178         cvtps2pi mm3,xmm3
    179         cvtps2pi mm6,xmm6
    180         cvtps2pi mm7,xmm7
    181 
                ; Narrow 2+2 int32 values to four 16-bit words with signed saturation;
                ; the destination keeps the low pair first, so element order is kept.
    182         packssdw mm0,mm4
    183         packssdw mm1,mm5
    184         packssdw mm2,mm6
    185         packssdw mm3,mm7
    186 
                ; Four words per store.  MMBLOCK is defined outside this file; this
                ; layout assumes JCOEF is 16 bits wide -- TODO confirm.
    187         movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    188         movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    189         movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
    190         movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
    191 
                ; Advance all three pointers by the 16 elements just processed.
    192         add     esi, byte 16*SIZEOF_FAST_FLOAT
    193         add     edx, byte 16*SIZEOF_FAST_FLOAT
    194         add     edi, byte 16*SIZEOF_JCOEF
    195         dec     eax
    196         jnz     short .quantloop
    197 
    198         emms            ; empty MMX state
    199 
    200         pop     edi
    201         pop     esi
    202 ;       pop     edx             ; need not be preserved
    203 ;       pop     ecx             ; unused
    204 ;       pop     ebx             ; unused
    205         pop     ebp
    206         ret
    207 
    208 ; For some reason, the OS X linker does not honor the request to align the
    209 ; segment unless the code ends with an explicit alignment directive, so the
        ; end of the section is padded to a 16-byte boundary here.
    210         align   16
    211