Home | History | Annotate | Download | only in i386
      1 ;
      2 ; jquanti.asm - sample data conversion and quantization (AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, 2018, D. R. Commander.
      6 ; Copyright (C) 2016, Matthieu Darbois.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jsimdext.inc"
     21 %include "jdct.inc"
     22 
     23 ; --------------------------------------------------------------------------
     24     SECTION     SEG_TEXT
     25     BITS        32
     26 ; (SEG_TEXT is defined outside this file; code below is 32-bit.)
     27 ; Load data into workspace, applying unsigned->signed conversion
     28 ; (8 bytes from each of 8 rows are widened to words, then 0xFF80 is added)
     29 ; GLOBAL(void)
     30 ; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
     31 ;                     DCTELEM *workspace);
     32 ; Arguments are read from the stack via ebp.  ebx, esi and edi are saved
     33 ; and restored; eax, edx and ymm0-ymm7 are overwritten.
     34 %define sample_data  ebp + 8            ; JSAMPARRAY sample_data (1st stack argument)
     35 %define start_col    ebp + 12           ; JDIMENSION start_col (2nd stack argument)
     36 %define workspace    ebp + 16           ; DCTELEM *workspace (3rd stack argument)
     37 
     38     align       32
     39     GLOBAL_FUNCTION(jsimd_convsamp_avx2)
     40 
     41 EXTN(jsimd_convsamp_avx2):
     42     push        ebp
     43     mov         ebp, esp                ; ebp = frame base; arguments start at ebp+8
     44     push        ebx                     ; saved: used below for even-row pointers
     45 ;   push        ecx                     ; need not be preserved
     46 ;   push        edx                     ; need not be preserved
     47     push        esi                     ; saved: holds sample_data
     48     push        edi                     ; saved: holds workspace
     49 
     50     mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
     51     mov         eax, JDIMENSION [start_col]    ; eax = column index used for every row
     52     mov         edi, POINTER [workspace]       ; (DCTELEM *)
     53 ; Rows 0-1: fetch the two row pointers, load 8 bytes from each at start_col.
     54     mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     55     mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     56     movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     57     movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     58 ; Rows 2-3, same pattern.
     59     mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     60     mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     61     movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     62     movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     63 ; Rows 4-5, same pattern.
     64     mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     65     mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     66     movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     67     movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     68 ; Rows 6-7, same pattern.
     69     mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     70     mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     71     movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
     72     movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]
     73 ; Pair rows: odd row goes into the upper 128-bit lane, even row stays low.
     74     vinserti128 ymm0, ymm0, xmm1, 1     ; ymm0 = (row0 | row1)
     75     vinserti128 ymm2, ymm2, xmm3, 1     ; ymm2 = (row2 | row3)
     76     vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4 = (row4 | row5)
     77     vinserti128 ymm6, ymm6, xmm7, 1     ; ymm6 = (row6 | row7)
     78 ; Widen bytes to words: interleave the low 8 bytes of each lane with zeros.
     79     vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
     80     vpunpcklbw  ymm0, ymm0, ymm1
     81     vpunpcklbw  ymm2, ymm2, ymm1
     82     vpunpcklbw  ymm4, ymm4, ymm1
     83     vpunpcklbw  ymm6, ymm6, ymm1
     84 ; Build the bias without a memory constant: all-ones words shifted left by 7.
     85     vpcmpeqw    ymm7, ymm7, ymm7        ; ymm7={0xFFFF 0xFFFF 0xFFFF 0xFFFF ..}
     86     vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
     87 ; Add 0xFF80 (-128 as a signed word) to every widened value.
     88     vpaddw      ymm0, ymm0, ymm7
     89     vpaddw      ymm2, ymm2, ymm7
     90     vpaddw      ymm4, ymm4, ymm7
     91     vpaddw      ymm6, ymm6, ymm7
     92 ; Store the four 32-byte vectors to workspace (unaligned stores).
     93     vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
     94     vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
     95     vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
     96     vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6
     97 
     98     vzeroupper                          ; clear upper YMM halves before returning
     99     pop         edi                     ; restore in reverse order of the pushes
    100     pop         esi
    101 ;   pop         edx                     ; need not be preserved
    102 ;   pop         ecx                     ; need not be preserved
    103     pop         ebx
    104     pop         ebp
    105     ret
    106 
    107 ; --------------------------------------------------------------------------
    108 ;
    109 ; Quantize/descale the coefficients, and store into coef_block
    110 ; Per word w: sign(w) * mulhi(mulhi(|w| + correction, reciprocal), scale)
    111 ; This implementation is based on an algorithm described in
    112 ;   "How to optimize for the Pentium family of microprocessors"
    113 ;   (http://www.agner.org/assem/).
    114 ; (mulhi = upper 16 bits of the unsigned 16x16 product, i.e. vpmulhuw)
    115 ; GLOBAL(void)
    116 ; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
    117 ;                     DCTELEM *workspace);
    118 ; esi and edi are saved and restored; edx and ymm0-ymm7 are overwritten.
    119 ; Accessors for three tables inside divisors (b = base register), selected by passing DCTSIZE*0/1/2 + m to YMMBLOCK.
    120 %define RECIPROCAL(m, n, b) \
    121   YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
    122 %define CORRECTION(m, n, b) \
    123   YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
    124 %define SCALE(m, n, b) \
    125   YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
    126 
    127 %define coef_block  ebp + 8             ; JCOEFPTR coef_block (1st stack argument)
    128 %define divisors    ebp + 12            ; DCTELEM *divisors (2nd stack argument)
    129 %define workspace   ebp + 16            ; DCTELEM *workspace (3rd stack argument)
    130 
    131     align       32
    132     GLOBAL_FUNCTION(jsimd_quantize_avx2)
    133 
    134 EXTN(jsimd_quantize_avx2):
    135     push        ebp
    136     mov         ebp, esp                ; ebp = frame base; arguments start at ebp+8
    137 ;   push        ebx                     ; unused
    138 ;   push        ecx                     ; unused
    139 ;   push        edx                     ; need not be preserved
    140     push        esi                     ; saved: holds workspace
    141     push        edi                     ; saved: holds coef_block
    142 
    143     mov         esi, POINTER [workspace]       ; esi = input words
    144     mov         edx, POINTER [divisors]        ; edx = base of the divisor tables
    145     mov         edi, JCOEFPTR [coef_block]     ; edi = output block
    146 ; Load the four 32-byte input vectors; keep them in ymm4-ymm7 for the sign step.
    147     vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    148     vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    149     vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    150     vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
    151     vpabsw      ymm0, ymm4              ; ymm0-ymm3 = per-word absolute values
    152     vpabsw      ymm1, ymm5
    153     vpabsw      ymm2, ymm6
    154     vpabsw      ymm3, ymm7
    155 ; Bias, then two unsigned high-half multiplies (reciprocal, then scale).
    156     vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    157     vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
    158     vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
    159     vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
    160     vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    161     vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    162     vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    163     vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
    164     vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
    165     vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
    166     vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
    167     vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]
    168 ; Reapply the sign of the original input word (a zero input yields zero).
    169     vpsignw     ymm0, ymm0, ymm4
    170     vpsignw     ymm1, ymm1, ymm5
    171     vpsignw     ymm2, ymm2, ymm6
    172     vpsignw     ymm3, ymm3, ymm7
    173 ; Store the four result vectors to coef_block (unaligned stores).
    174     vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    175     vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    176     vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    177     vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3
    178 
    179     vzeroupper                          ; clear upper YMM halves before returning
    180     pop         edi                     ; restore in reverse order of the pushes
    181     pop         esi
    182 ;   pop         edx                     ; need not be preserved
    183 ;   pop         ecx                     ; unused
    184 ;   pop         ebx                     ; unused
    185     pop         ebp
    186     ret
    187 
    188 ; For some reason, the OS X linker does not honor the request to align the
    189 ; segment unless we do this (pad the end of the code to a 32-byte boundary).
    190     align       32
    191