Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 %include "jdct.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23         SECTION SEG_TEXT                ; SEG_TEXT is defined outside this file (presumably by jsimdext.inc - confirm)
     24         BITS    64                      ; everything below is assembled as 64-bit code
     25 ;
     26 ; Load data into workspace, applying unsigned->signed conversion:
     27 ; 8 bytes are read from each row, widened to 16-bit words, and 0xFF80 (-128) is added.
     28 ; GLOBAL(void)
     29 ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
     30 ;                      DCTELEM *workspace);
     31 ; workspace is written with movdqa, so it must be 16-byte aligned.
     32 ; Arguments, as they are read after collect_args:
     33 ; r10 = JSAMPARRAY sample_data
     34 ; r11 = JDIMENSION start_col
     35 ; r12 = DCTELEM *workspace
     36 ; Loop registers: rsi=row-pointer cursor, rax=start_col, rdi=output cursor, rcx=count
     37         align   16
     38         global  EXTN(jsimd_convsamp_sse2)
     39 
     40 EXTN(jsimd_convsamp_sse2):
     41         push    rbp
     42         mov     rax,rsp                 ; rax = rsp after the push; presumably consumed by collect_args - confirm
     43         mov     rbp,rsp
     44         collect_args                    ; macro defined outside this file; args are read from r10-r12 afterwards
     45         push    rbx                     ; rbx is used as a scratch row pointer in the loop
     46 ; Loop-invariant constants (xmm6 and xmm7 stay live for the whole loop):
     47         pxor    xmm6,xmm6               ; xmm6=(all 0's)
     48         pcmpeqw xmm7,xmm7
     49         psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} (-128 per word)
     50 ; Copy the arguments into the loop registers.
     51         mov rsi, r10                    ; rsi = sample_data (array of row pointers)
     52         mov eax, r11d                   ; rax = start_col; the 32-bit write zero-extends into rax
     53         mov rdi, r12                    ; rdi = workspace
     54         mov     rcx, DCTSIZE/4          ; iteration count: four rows are handled per pass
     55 .convloop:
     56         mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; rbx = row pointer 0 (JSAMPLE *)
     57         mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; rdx = row pointer 1 (JSAMPLE *)
     58 ; Fetch 8 bytes per row at offset start_col*SIZEOF_JSAMPLE from the row start.
     59         movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
     60         movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
     61 
     62         mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; rbx = row pointer 2 (JSAMPLE *)
     63         mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; rdx = row pointer 3 (JSAMPLE *)
     64 
     65         movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
     66         movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
     67 ; Widen bytes to words by interleaving with zero, then add -128 to every word.
     68         punpcklbw xmm0,xmm6             ; xmm0=(01234567) as 8 zero-extended words
     69         punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF) as 8 zero-extended words
     70         paddw     xmm0,xmm7             ; unsigned -> signed
     71         paddw     xmm1,xmm7
     72         punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN) as 8 zero-extended words
     73         punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV) as 8 zero-extended words
     74         paddw     xmm2,xmm7
     75         paddw     xmm3,xmm7
     76 ; Aligned stores of the four converted rows; XMMBLOCK is defined outside this file (presumably selects row m at rdi).
     77         movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
     78         movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
     79         movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
     80         movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
     81 
     82         add     rsi, byte 4*SIZEOF_JSAMPROW             ; step past the 4 row pointers just used
     83         add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; step past the 4 output rows just written
     84         dec     rcx
     85         jnz     short .convloop
     86 ; NOTE(review): xmm6/xmm7 and rsi/rdi are callee-saved in the Microsoft x64 ABI; presumably collect_args/uncollect_args preserve them - confirm.
     87         pop     rbx
     88         uncollect_args                  ; macro defined outside this file; presumably the inverse of collect_args
     89         pop     rbp
     90         ret
     91 
     92 ; --------------------------------------------------------------------------
     93 ;
     94 ; Quantize/descale the coefficients, and store into coef_block
     95 ;
     96 ; This implementation is based on an algorithm described in
     97 ;   "How to optimize for the Pentium family of microprocessors"
     98 ;   (http://www.agner.org/assem/).
     99 ; Per word w: out = sign(w) * hi16(hi16((|w| + corr) * recip) * scale), unsigned products
    100 ; GLOBAL(void)
    101 ; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
    102 ;                      DCTELEM *workspace);
    103 ; workspace, divisors and coef_block are all accessed with aligned 128-bit operations.
    104 ; The three macros below address sub-tables of divisors, DCTSIZE XMMBLOCK rows apart.
    105 %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
    106 %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
    107 %define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
    108 ; Arguments, as they are read after collect_args:
    109 ; r10 = JCOEFPTR coef_block
    110 ; r11 = DCTELEM *divisors
    111 ; r12 = DCTELEM *workspace
    112 ; Loop registers: rsi=workspace cursor, rdx=divisors cursor, rdi=coef_block cursor, rax=count
    113         align   16
    114         global  EXTN(jsimd_quantize_sse2)
    115 
    116 EXTN(jsimd_quantize_sse2):
    117         push    rbp
    118         mov     rax,rsp                 ; rax = rsp after the push; presumably consumed by collect_args - confirm
    119         mov     rbp,rsp
    120         collect_args                    ; macro defined outside this file; args are read from r10-r12 afterwards
    121 ; Copy the arguments into the loop registers.
    122         mov rsi, r12                    ; rsi = workspace (input)
    123         mov rdx, r11                    ; rdx = divisors
    124         mov rdi, r10                    ; rdi = coef_block (output)
    125         mov     rax, DCTSIZE2/32        ; iteration count: 32 elements are handled per pass
    126 .quantloop:
    127         movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]        ; load four registers of input words
    128         movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    129         movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    130         movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    131         movdqa  xmm0,xmm4               ; xmm0-xmm3 = working copies of the input
    132         movdqa  xmm1,xmm5
    133         movdqa  xmm2,xmm6
    134         movdqa  xmm3,xmm7
    135         psraw   xmm4,(WORD_BIT-1)       ; xmm4-xmm7 = per-word sign fill (presumably WORD_BIT is 16 - confirm)
    136         psraw   xmm5,(WORD_BIT-1)       ; these masks are reused after the multiplies, so
    137         psraw   xmm6,(WORD_BIT-1)       ; xmm4-xmm7 must not be overwritten before then
    138         psraw   xmm7,(WORD_BIT-1)
    139         pxor    xmm0,xmm4               ; (x ^ mask) - mask: negates the words whose mask is all ones
    140         pxor    xmm1,xmm5
    141         pxor    xmm2,xmm6
    142         pxor    xmm3,xmm7
    143         psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
    144         psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
    145         psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
    146         psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
    147 ; Work on magnitudes: add the correction, then keep the high word of two unsigned multiplies.
    148         paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    149         paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    150         paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    151         paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    152         pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    153         pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    154         pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    155         pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    156         pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
    157         pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
    158         pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
    159         pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
    160 ; Reapply the saved signs with the same (x ^ mask) - mask step used above.
    161         pxor    xmm0,xmm4
    162         pxor    xmm1,xmm5
    163         pxor    xmm2,xmm6
    164         pxor    xmm3,xmm7
    165         psubw   xmm0,xmm4
    166         psubw   xmm1,xmm5
    167         psubw   xmm2,xmm6
    168         psubw   xmm3,xmm7
    169         movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0        ; aligned stores of the signed results
    170         movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    171         movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    172         movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
    173 
    174         add     rsi, byte 32*SIZEOF_DCTELEM     ; advance input by the 32 elements just read
    175         add     rdx, byte 32*SIZEOF_DCTELEM     ; advance all three divisor sub-tables in step
    176         add     rdi, byte 32*SIZEOF_JCOEF       ; advance output by the 32 elements just written
    177         dec     rax
    178         jnz     near .quantloop         ; near form; presumably the loop body exceeds short-jump range
    179 ; NOTE(review): xmm6/xmm7 and rsi/rdi are callee-saved in the Microsoft x64 ABI; presumably collect_args/uncollect_args preserve them - confirm.
    180         uncollect_args                  ; macro defined outside this file; presumably the inverse of collect_args
    181         pop     rbp
    182         ret
    183 
    184 ; For some reason, the OS X linker does not honor the request to align the
    185 ; segment unless we do this.
    186         align   16
    187