Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsample.asm - downsampling (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler),
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; [TAB8]
     17 
     18 %include "jsimdext.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21         SECTION SEG_TEXT                ; code goes here; SEG_TEXT is not defined in this file (presumably by jsimdext.inc - confirm)
     22         BITS    32                      ; assemble everything below as 32-bit code
     23 ;
     24 ; Downsample pixel values of a single component.
     25 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     26 ; without smoothing.
     27 ;
     28 ; GLOBAL(void)
     29 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
     30 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     31 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
     32 ;
     33 
     34 %define img_width(b)    (b)+8           ; JDIMENSION image_width
     35 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
     36 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
     37 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
     38 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
     39 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
     40 
        ; The macros above give each stack argument's offset from a base
        ; register.  The function passes ebp, which its prologue sets to esp
        ; right after pushing the caller's ebp, so the first argument is at
        ; [ebp+8].
        ;
        ; Register roles inside the function:
        ;   eax  = row counter (edge expansion: rows left to pad;
        ;          row loop: output rows left)
        ;   ecx  = output_cols (width_blocks * 8); temporarily the pad count
        ;          during edge expansion and the columns left in a row
        ;   edx  = image_width during edge expansion, then scratch for the bias
        ;   esi  = input row-pointer array / current input row pointer
        ;   edi  = output row-pointer array / current output row pointer
        ;   xmm6 = per-word mask keeping the low byte of every 16-bit word
        ;   xmm7 = bias added before the shift: 0 and 1 in alternating words
        ;
        ; eax, ecx, edx, xmm0-xmm3, xmm6, xmm7 and the flags are modified and
        ; not restored; ebp, esi and edi are saved and restored.
        ; movdqa is used for every row load and store, so the row buffers must
        ; be 16-byte aligned (presumably guaranteed by the caller - confirm).
     41         align   16
     42         global  EXTN(jsimd_h2v1_downsample_sse2)
     43 
     44 EXTN(jsimd_h2v1_downsample_sse2):
     45         push    ebp
     46         mov     ebp,esp                 ; ebp = frame base used by the argument macros
     47 ;       push    ebx             ; unused
     48 ;       push    ecx             ; need not be preserved
     49 ;       push    edx             ; need not be preserved
     50         push    esi                     ; restored at .return
     51         push    edi                     ; restored at .return
     52 
     53         mov     ecx, JDIMENSION [width_blks(ebp)]
     54         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
     55         jz      near .return            ; ZF from the shift: no output columns, nothing to do
     56 
     57         mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
     58 
     59         ; -- expand_right_edge
                ; Pad every input row on the right, from image_width up to
                ; output_cols*2 samples, by replicating the row's last sample.
     60 
     61         push    ecx                             ; keep output_cols for after the expansion
     62         shl     ecx,1                           ; output_cols * 2
     63         sub     ecx,edx                         ; ecx = number of samples to append per row
     64         jle     short .expand_end               ; signed test: nothing to append
     65 
     66         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of rows to pad
     67         test    eax,eax
     68         jle     short .expand_end               ; zero or negative row count: skip
     69 
     70         cld                                     ; make stosb advance edi upwards
     71         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
     72         alignx  16,7
     73 .expandloop:
     74         push    eax                             ; al is overwritten with the fill sample below
     75         push    ecx                             ; rep stosb counts ecx down to zero
     76 
     77         mov     edi, JSAMPROW [esi]             ; edi = start of this row
     78         add     edi,edx                         ; edi = first byte past the last real sample
     79         mov     al, JSAMPLE [edi-1]             ; al = last real sample of the row
     80 
     81         rep stosb                               ; append ecx copies of al
     82 
     83         pop     ecx                             ; restore the pad count
     84         pop     eax                             ; restore the row counter
     85 
     86         add     esi, byte SIZEOF_JSAMPROW       ; next entry of the row-pointer array
     87         dec     eax
     88         jg      short .expandloop               ; loop while rows remain
     89 
     90 .expand_end:
     91         pop     ecx                             ; output_cols
     92 
     93         ; -- h2v1_downsample
                ; Each output sample is (left + right + bias) >> 1 for one
                ; horizontal pair of input samples.
     94 
     95         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
     96         test    eax,eax
     97         jle     near .return                    ; signed test: values with the top bit set also return
     98 
     99         mov     edx, 0x00010000         ; bias pattern
    100         movd    xmm7,edx                ; low dword of xmm7 = words {0, 1}
    101         pcmpeqw xmm6,xmm6               ; xmm6 = all ones
    102         pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    103         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    104 
    105         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    106         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    107         alignx  16,7
    108 .rowloop:
    109         push    ecx                             ; output_cols, consumed by the column loop
    110         push    edi                             ; position in the output row-pointer array
    111         push    esi                             ; position in the input row-pointer array
    112 
    113         mov     esi, JSAMPROW [esi]             ; inptr
    114         mov     edi, JSAMPROW [edi]             ; outptr
    115 
    116         cmp     ecx, byte SIZEOF_XMMWORD
    117         jae     short .columnloop               ; unsigned: a full XMMWORD of output is still due
    118         alignx  16,7
    119 
                ; Tail case: fewer than SIZEOF_XMMWORD output columns remain.
                ; ecx is a non-zero multiple of 8 here, so presumably exactly 8
                ; remain (assumes SIZEOF_XMMWORD is 16 - confirm).  Only one
                ; input XMMWORD is loaded; the second is zeroed, which makes the
                ; upper half of the stored result zero.  A full XMMWORD is still
                ; read from the input row and written to the output row.
    120 .columnloop_r8:
    121         movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    122         pxor    xmm1,xmm1                       ; no second input XMMWORD
    123         mov     ecx, SIZEOF_XMMWORD             ; so the sub below leaves 0 and the loop ends
    124         jmp     short .downsample
    125         alignx  16,7
    126 
    127 .columnloop:
    128         movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; input samples for the low half of the output
    129         movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; input samples for the high half of the output
    130 
    131 .downsample:
    132         movdqa  xmm2,xmm0
    133         movdqa  xmm3,xmm1
    134 
    135         pand    xmm0,xmm6               ; even-indexed bytes, one per word
    136         psrlw   xmm2,BYTE_BIT           ; odd-indexed bytes, one per word
    137         pand    xmm1,xmm6
    138         psrlw   xmm3,BYTE_BIT
    139 
    140         paddw   xmm0,xmm2               ; sum of each horizontal pair
    141         paddw   xmm1,xmm3
    142         paddw   xmm0,xmm7               ; add the alternating 0/1 bias
    143         paddw   xmm1,xmm7
    144         psrlw   xmm0,1                  ; divide by 2
    145         psrlw   xmm1,1
    146 
    147         packuswb xmm0,xmm1              ; words -> bytes: xmm0 low half, xmm1 high half
    148 
    149         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; store one XMMWORD of output samples
    150 
    151         sub     ecx, byte SIZEOF_XMMWORD        ; outcol
    152         add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
    153         add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
    154         cmp     ecx, byte SIZEOF_XMMWORD
    155         jae     short .columnloop               ; another full XMMWORD of output remains
    156         test    ecx,ecx
    157         jnz     short .columnloop_r8            ; partial remainder: take the tail path
    158 
    159         pop     esi                             ; back to the row-pointer arrays
    160         pop     edi
    161         pop     ecx                             ; output_cols for the next row
    162 
    163         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    164         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    165         dec     eax                             ; rowctr
    166         jg      near .rowloop
    167 
    168 .return:
    169         pop     edi
    170         pop     esi
    171 ;       pop     edx             ; need not be preserved
    172 ;       pop     ecx             ; need not be preserved
    173 ;       pop     ebx             ; unused
    174         pop     ebp
    175         ret
    176 
    177 ; --------------------------------------------------------------------------
    178 ;
    179 ; Downsample pixel values of a single component.
    180 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    181 ; without smoothing.
    182 ;
    183 ; GLOBAL(void)
    184 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    185 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    186 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    187 ;
    188 
    189 %define img_width(b)    (b)+8           ; JDIMENSION image_width
    190 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
    191 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
    192 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
    193 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
    194 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
    195 
        ; The macros above give each stack argument's offset from a base
        ; register.  The function passes ebp, which its prologue sets to esp
        ; right after pushing the caller's ebp, so the first argument is at
        ; [ebp+8].
        ;
        ; Register roles inside the function:
        ;   eax  = row counter (edge expansion: rows left to pad;
        ;          row loop: output rows left)
        ;   ecx  = output_cols (width_blocks * 8); temporarily the pad count
        ;          during edge expansion and the columns left in a row
        ;   edx  = image_width during edge expansion, scratch for the bias,
        ;          then the upper input row pointer (inptr0) in the row loop
        ;   esi  = input row-pointer array / lower input row pointer (inptr1)
        ;   edi  = output row-pointer array / current output row pointer
        ;   xmm6 = per-word mask keeping the low byte of every 16-bit word
        ;   xmm7 = bias added before the shift: 1 and 2 in alternating words
        ;
        ; eax, ecx, edx, xmm0-xmm7 and the flags are modified and not
        ; restored; ebp, esi and edi are saved and restored.
        ; movdqa is used for every row load and store, so the row buffers must
        ; be 16-byte aligned (presumably guaranteed by the caller - confirm).
    196         align   16
    197         global  EXTN(jsimd_h2v2_downsample_sse2)
    198 
    199 EXTN(jsimd_h2v2_downsample_sse2):
    200         push    ebp
    201         mov     ebp,esp                 ; ebp = frame base used by the argument macros
    202 ;       push    ebx             ; unused
    203 ;       push    ecx             ; need not be preserved
    204 ;       push    edx             ; need not be preserved
    205         push    esi                     ; restored at .return
    206         push    edi                     ; restored at .return
    207 
    208         mov     ecx, JDIMENSION [width_blks(ebp)]
    209         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
    210         jz      near .return            ; ZF from the shift: no output columns, nothing to do
    211 
    212         mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
    213 
    214         ; -- expand_right_edge
                ; Pad every input row on the right, from image_width up to
                ; output_cols*2 samples, by replicating the row's last sample.
    215 
    216         push    ecx                             ; keep output_cols for after the expansion
    217         shl     ecx,1                           ; output_cols * 2
    218         sub     ecx,edx                         ; ecx = number of samples to append per row
    219         jle     short .expand_end               ; signed test: nothing to append
    220 
    221         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of rows to pad
    222         test    eax,eax
    223         jle     short .expand_end               ; zero or negative row count: skip
    224 
    225         cld                                     ; make stosb advance edi upwards
    226         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    227         alignx  16,7
    228 .expandloop:
    229         push    eax                             ; al is overwritten with the fill sample below
    230         push    ecx                             ; rep stosb counts ecx down to zero
    231 
    232         mov     edi, JSAMPROW [esi]             ; edi = start of this row
    233         add     edi,edx                         ; edi = first byte past the last real sample
    234         mov     al, JSAMPLE [edi-1]             ; al = last real sample of the row
    235 
    236         rep stosb                               ; append ecx copies of al
    237 
    238         pop     ecx                             ; restore the pad count
    239         pop     eax                             ; restore the row counter
    240 
    241         add     esi, byte SIZEOF_JSAMPROW       ; next entry of the row-pointer array
    242         dec     eax
    243         jg      short .expandloop               ; loop while rows remain
    244 
    245 .expand_end:
    246         pop     ecx                             ; output_cols
    247 
    248         ; -- h2v2_downsample
                ; Each output sample is (sum of a 2x2 input square + bias) >> 2,
                ; taken from two consecutive input rows.
    249 
    250         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
    251         test    eax,eax
    252         jle     near .return                    ; signed test: values with the top bit set also return
    253 
    254         mov     edx, 0x00020001         ; bias pattern
    255         movd    xmm7,edx                ; low dword of xmm7 = words {1, 2}
    256         pcmpeqw xmm6,xmm6               ; xmm6 = all ones
    257         pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    258         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    259 
    260         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    261         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    262         alignx  16,7
    263 .rowloop:
    264         push    ecx                             ; output_cols, consumed by the column loop
    265         push    edi                             ; position in the output row-pointer array
    266         push    esi                             ; position in the input row-pointer array
    267 
    268         mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    269         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
    270         mov     edi, JSAMPROW [edi]                     ; outptr
    271 
    272         cmp     ecx, byte SIZEOF_XMMWORD
    273         jae     short .columnloop               ; unsigned: a full XMMWORD of output is still due
    274         alignx  16,7
    275 
                ; Tail case: fewer than SIZEOF_XMMWORD output columns remain.
                ; ecx is a non-zero multiple of 8 here, so presumably exactly 8
                ; remain (assumes SIZEOF_XMMWORD is 16 - confirm).  Only the
                ; first XMMWORD of each input row is loaded; the second pair is
                ; zeroed, which makes the upper half of the stored result zero.
                ; A full XMMWORD is still read from each input row and written
                ; to the output row.
    276 .columnloop_r8:
    277         movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
    278         movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    279         pxor    xmm2,xmm2                       ; no second XMMWORD from the upper row
    280         pxor    xmm3,xmm3                       ; no second XMMWORD from the lower row
    281         mov     ecx, SIZEOF_XMMWORD             ; so the sub below leaves 0 and the loop ends
    282         jmp     short .downsample
    283         alignx  16,7
    284 
    285 .columnloop:
    286         movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]    ; upper row, samples for the low output half
    287         movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; lower row, samples for the low output half
    288         movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]    ; upper row, samples for the high output half
    289         movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; lower row, samples for the high output half
    290 
    291 .downsample:
                ; Horizontal pair sums for the first XMMWORD of each row.
    292         movdqa  xmm4,xmm0
    293         movdqa  xmm5,xmm1
    294         pand    xmm0,xmm6               ; even-indexed bytes, one per word
    295         psrlw   xmm4,BYTE_BIT           ; odd-indexed bytes, one per word
    296         pand    xmm1,xmm6
    297         psrlw   xmm5,BYTE_BIT
    298         paddw   xmm0,xmm4               ; upper-row pair sums
    299         paddw   xmm1,xmm5               ; lower-row pair sums
    300 
                ; Same for the second XMMWORD of each row (xmm4/xmm5 reused).
    301         movdqa  xmm4,xmm2
    302         movdqa  xmm5,xmm3
    303         pand    xmm2,xmm6
    304         psrlw   xmm4,BYTE_BIT
    305         pand    xmm3,xmm6
    306         psrlw   xmm5,BYTE_BIT
    307         paddw   xmm2,xmm4
    308         paddw   xmm3,xmm5
    309 
    310         paddw   xmm0,xmm1               ; upper + lower row: sum of each 2x2 square
    311         paddw   xmm2,xmm3
    312         paddw   xmm0,xmm7               ; add the alternating 1/2 bias
    313         paddw   xmm2,xmm7
    314         psrlw   xmm0,2                  ; divide by 4
    315         psrlw   xmm2,2
    316 
    317         packuswb xmm0,xmm2              ; words -> bytes: xmm0 low half, xmm2 high half
    318 
    319         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; store one XMMWORD of output samples
    320 
    321         sub     ecx, byte SIZEOF_XMMWORD        ; outcol
    322         add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
    323         add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
    324         add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
    325         cmp     ecx, byte SIZEOF_XMMWORD
    326         jae     near .columnloop                ; another full XMMWORD of output remains
    327         test    ecx,ecx
    328         jnz     near .columnloop_r8             ; partial remainder: take the tail path
    329 
    330         pop     esi                             ; back to the row-pointer arrays
    331         pop     edi
    332         pop     ecx                             ; output_cols for the next row
    333 
    334         add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
    335         add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
    336         dec     eax                             ; rowctr
    337         jg      near .rowloop
    338 
    339 .return:
    340         pop     edi
    341         pop     esi
    342 ;       pop     edx             ; need not be preserved
    343 ;       pop     ecx             ; need not be preserved
    344 ;       pop     ebx             ; unused
    345         pop     ebp
    346         ret
    347 
    348 ; For some reason, the OS X linker does not honor the request to align the
    349 ; segment unless we do this.
    350         align   16
    351