Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jquant.asm - sample data conversion and quantization (SSE & MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 %include "jdct.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23         SECTION SEG_TEXT                ; SEG_TEXT is not defined in this file (presumably from jsimdext.inc)
     24         BITS    32                      ; everything below is assembled as 32-bit code
     25 ;
     26 ; Load data into workspace, applying unsigned->signed conversion
     27 ;
     28 ; GLOBAL(void)
     29 ; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
     30 ;                           FAST_FLOAT * workspace);
     31 ;
        ; Each loop iteration handles two sample rows.  Eight bytes are read
        ; from each row at index start_col, 0x80 is subtracted from every byte,
        ; and the results are sign-extended to 32 bits, converted to
        ; single-precision floats and written to workspace with four movaps
        ; stores.  The loop runs DCTSIZE/2 times.
        ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  workspace is written with movaps and therefore has to be
        ; 16-byte aligned.
        ;
        ; Registers: ebx, esi and edi are saved and restored; eax, ecx and edx
        ; are overwritten, as are mm0-mm7 and xmm0-xmm7.  emms is executed
        ; before returning.
        ;
     32 
     33 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
     34 %define start_col       ebp+12          ; JDIMENSION start_col
     35 %define workspace       ebp+16          ; FAST_FLOAT * workspace
     36 
     37         align   16
     38         global  EXTN(jsimd_convsamp_float_sse)
     39 
     40 EXTN(jsimd_convsamp_float_sse):
     41         push    ebp
     42         mov     ebp,esp                 ; frame pointer: arguments are at ebp+8 and up
     43         push    ebx
     44 ;       push    ecx             ; need not be preserved
     45 ;       push    edx             ; need not be preserved
     46         push    esi
     47         push    edi
     48 
                ; Build the per-byte bias constant without touching memory.
     49         pcmpeqw  mm7,mm7                ; mm7 = all bits set
     50         psllw    mm7,7                  ; every word = 0xFF80
     51         packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)
     52 
     53         mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
     54         mov     eax, JDIMENSION [start_col]
     55         mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
     56         mov     ecx, DCTSIZE/2                  ; loop counter: two rows per iteration
     57         alignx  16,7                            ; alignx is a macro from outside this file; presumably aligns the loop entry
     58 .convloop:
     59         mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
     60         mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
     61 
     62         movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; 8 samples of the first row
     63         movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; 8 samples of the second row
     64 
     65         psubb   mm0,mm7                         ; mm0=(01234567)
     66         psubb   mm1,mm7                         ; mm1=(89ABCDEF)
     67 
                ; In the lane diagrams below '*' marks a byte whose value is
                ; ignored: the unpacks interleave with whatever the destination
                ; register already held, leaving each sample in the upper byte
                ; of its lane.
     68         punpcklbw mm2,mm0                       ; mm2=(*0*1*2*3)
     69         punpckhbw mm0,mm0                       ; mm0=(*4*5*6*7)
     70         punpcklbw mm3,mm1                       ; mm3=(*8*9*A*B)
     71         punpckhbw mm1,mm1                       ; mm1=(*C*D*E*F)
     72 
     73         punpcklwd mm4,mm2                       ; mm4=(***0***1)
     74         punpckhwd mm2,mm2                       ; mm2=(***2***3)
     75         punpcklwd mm5,mm0                       ; mm5=(***4***5)
     76         punpckhwd mm0,mm0                       ; mm0=(***6***7)
     77 
                ; The arithmetic right shift moves each sample down to the low
                ; byte of its dword with sign extension, discarding the ignored
                ; bytes (assuming DWORD_BIT-BYTE_BIT is 24; both constants are
                ; defined outside this file).  cvtpi2ps then converts the two
                ; dwords into the low two floats of the xmm register; the upper
                ; two ('**') are left as they were.
     78         psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
     79         psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
     80         cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
     81         cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
     82         psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
     83         psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
     84         cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
     85         cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)
     86 
                ; Same widening for the second row.  mm4 is reused here; its
                ; previous contents (01) were already converted into xmm0.
     87         punpcklwd mm6,mm3                       ; mm6=(***8***9)
     88         punpckhwd mm3,mm3                       ; mm3=(***A***B)
     89         punpcklwd mm4,mm1                       ; mm4=(***C***D)
     90         punpckhwd mm1,mm1                       ; mm1=(***E***F)
     91 
     92         psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
     93         psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
     94         cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
     95         cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
     96         psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
     97         psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
     98         cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
     99         cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)
    100 
                ; Merge pairs of half-filled registers: movlhps copies the low
                ; two floats of the source into the high half of the destination.
    101         movlhps   xmm0,xmm1                     ; xmm0=(0123)
    102         movlhps   xmm2,xmm3                     ; xmm2=(4567)
    103         movlhps   xmm4,xmm5                     ; xmm4=(89AB)
    104         movlhps   xmm6,xmm7                     ; xmm6=(CDEF)
    105 
                ; Aligned stores: first row (0..7), then second row (8..F).
    106         movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
    107         movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
    108         movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
    109         movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
    110 
    111         add     esi, byte 2*SIZEOF_JSAMPROW             ; advance to the next two row pointers
    112         add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; advance the output pointer by two rows of floats
    113         dec     ecx
    114         jnz     near .convloop
    115 
    116         emms            ; empty MMX state
    117 
    118         pop     edi
    119         pop     esi
    120 ;       pop     edx             ; need not be preserved
    121 ;       pop     ecx             ; need not be preserved
    122         pop     ebx
    123         pop     ebp
    124         ret
    125 
    126 
    127 ; --------------------------------------------------------------------------
    128 ;
    129 ; Quantize/descale the coefficients, and store into coef_block
    130 ;
    131 ; GLOBAL(void)
    132 ; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
    133 ;                           FAST_FLOAT * workspace);
    134 ;
        ; Each loop iteration processes 16 values: the floats in workspace are
        ; multiplied element-wise by the matching floats in divisors (note
        ; that the code multiplies, so divisors presumably holds reciprocals --
        ; confirm against the caller), converted to 32-bit integers with
        ; cvtps2pi, packed to signed 16-bit words with saturation, and stored
        ; to coef_block.  The loop runs DCTSIZE2/16 times.
        ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  workspace and divisors are accessed with movaps / mulps
        ; memory operands and therefore have to be 16-byte aligned.
        ;
        ; Registers: esi and edi are saved and restored; eax and edx are
        ; overwritten, as are mm0-mm7 and xmm0-xmm7.  emms is executed before
        ; returning.
        ;
    135 
    136 %define coef_block      ebp+8           ; JCOEFPTR coef_block
    137 %define divisors        ebp+12          ; FAST_FLOAT * divisors
    138 %define workspace       ebp+16          ; FAST_FLOAT * workspace
    139 
    140         align   16
    141         global  EXTN(jsimd_quantize_float_sse)
    142 
    143 EXTN(jsimd_quantize_float_sse):
    144         push    ebp
    145         mov     ebp,esp                 ; frame pointer: arguments are at ebp+8 and up
    146 ;       push    ebx             ; unused
    147 ;       push    ecx             ; unused
    148 ;       push    edx             ; need not be preserved
    149         push    esi
    150         push    edi
    151 
    152         mov     esi, POINTER [workspace]        ; esi = input floats
    153         mov     edx, POINTER [divisors]         ; edx = per-coefficient multipliers
    154         mov     edi, JCOEFPTR [coef_block]      ; edi = output coefficients
    155         mov     eax, DCTSIZE2/16                ; loop counter: 16 values per iteration
    156         alignx  16,7                            ; alignx is a macro from outside this file; presumably aligns the loop entry
    157 .quantloop:
                ; Load four groups of four floats and scale each by its multiplier.
    158         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    159         movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    160         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    161         mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    162         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    163         movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    164         mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    165         mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    166 
                ; cvtps2pi only converts the low two floats of its source, so
                ; the high two floats are first copied down into another
                ; register with movhlps.
    167         movhlps  xmm4,xmm0              ; xmm4 low half = high two floats of xmm0
    168         movhlps  xmm5,xmm1              ; xmm5 low half = high two floats of xmm1
    169 
                ; Float -> int32 conversion; rounding follows the current MXCSR mode.
    170         cvtps2pi mm0,xmm0               ; mm0 = ints 0,1 of the first group
    171         cvtps2pi mm1,xmm1               ; mm1 = ints 0,1 of the second group
    172         cvtps2pi mm4,xmm4               ; mm4 = ints 2,3 of the first group
    173         cvtps2pi mm5,xmm5               ; mm5 = ints 2,3 of the second group
    174 
    175         movhlps  xmm6,xmm2              ; xmm6 low half = high two floats of xmm2
    176         movhlps  xmm7,xmm3              ; xmm7 low half = high two floats of xmm3
    177 
    178         cvtps2pi mm2,xmm2               ; mm2 = ints 0,1 of the third group
    179         cvtps2pi mm3,xmm3               ; mm3 = ints 0,1 of the fourth group
    180         cvtps2pi mm6,xmm6               ; mm6 = ints 2,3 of the third group
    181         cvtps2pi mm7,xmm7               ; mm7 = ints 2,3 of the fourth group
    182 
                ; Narrow int32 -> int16 with signed saturation; each result
                ; register then holds four words in their original order.
    183         packssdw mm0,mm4
    184         packssdw mm1,mm5
    185         packssdw mm2,mm6
    186         packssdw mm3,mm7
    187 
    188         movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    189         movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    190         movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
    191         movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
    192 
                ; Advance all three pointers past the 16 elements just processed.
    193         add     esi, byte 16*SIZEOF_FAST_FLOAT
    194         add     edx, byte 16*SIZEOF_FAST_FLOAT
    195         add     edi, byte 16*SIZEOF_JCOEF
    196         dec     eax
    197         jnz     short .quantloop
    198 
    199         emms            ; empty MMX state
    200 
    201         pop     edi
    202         pop     esi
    203 ;       pop     edx             ; need not be preserved
    204 ;       pop     ecx             ; unused
    205 ;       pop     ebx             ; unused
    206         pop     ebp
    207         ret
    208 
    209 ; For some reason, the OS X linker does not honor the request to align the
    210 ; segment unless we do this.
    211         align   16
    212