Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsample.asm - downsampling (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22         SECTION SEG_TEXT                ; section for the code below; SEG_TEXT is not defined in this file (presumably jsimdext.inc)
     23         BITS    32                      ; assemble everything that follows as 32-bit code
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
     31 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     32 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
     33 ;
        ; Argument access: after "push ebp / mov ebp,esp" the six arguments are
        ; read from [ebp+8] .. [ebp+28] through the macros defined below.
        ; esi and edi are saved on entry and restored at .return; eax, ecx, edx,
        ; xmm0-xmm3, xmm6 and xmm7 are overwritten without being saved.
        ;
        ; Steps:
        ;  1. output_cols = width_blocks << 3; return immediately if that is 0.
        ;  2. expand_right_edge: when output_cols*2 exceeds image_width, each of
        ;     the first max_v_samp_factor rows of input_data has the bytes from
        ;     offset image_width up to output_cols*2 filled with a copy of the
        ;     sample at offset image_width-1.
        ;  3. For v_samp_factor rows, each output sample i is computed as
        ;     (in[2*i] + in[2*i+1] + bias) >> 1, with bias = 0 for even i and
        ;     bias = 1 for odd i.
        ;
        ; Every XMMWORD load and store uses movdqa, which faults on an address
        ; that is not 16-byte aligned, so all row pointers must be aligned.
        ; SIZEOF_XMMWORD, SIZEOF_JSAMPROW, BYTE_BIT, EXTN, alignx and the
        ; JDIMENSION/INT/JSAMPARRAY/JSAMPROW/JSAMPLE/XMMWORD size keywords are
        ; not defined in this file (presumably they come from jsimdext.inc).
        ; NOTE(review): on the tail path (.columnloop_r8) the store still writes
        ; a full XMMWORD, so the output row presumably needs padding beyond
        ; output_cols -- confirm against the code that allocates the rows.
     34 
     35 %define img_width(b)    (b)+8           ; JDIMENSION image_width
     36 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
     37 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
     38 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
     39 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
     40 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
     41 
     42         align   16                      ; start the function on a 16-byte boundary
     43         global  EXTN(jsimd_h2v1_downsample_sse2)        ; export the entry point
     44 
     45 EXTN(jsimd_h2v1_downsample_sse2):
     46         push    ebp                     ; build an ebp frame: arguments start at ebp+8
     47         mov     ebp,esp
     48 ;       push    ebx             ; unused
     49 ;       push    ecx             ; need not be preserved
     50 ;       push    edx             ; need not be preserved
     51         push    esi                     ; restored at .return
     52         push    edi                     ; restored at .return
     53 
     54         mov     ecx, JDIMENSION [width_blks(ebp)]       ; ecx = width_blocks
     55         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
     56         jz      near .return            ; ZF comes from the shl: output_cols == 0, nothing to do
     57 
     58         mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
     59 
     60         ; -- expand_right_edge
     61 
     62         push    ecx                     ; keep output_cols; popped again at .expand_end
     63         shl     ecx,1                           ; output_cols * 2
     64         sub     ecx,edx                 ; ecx = number of pad bytes needed per row
     65         jle     short .expand_end       ; difference <= 0 (signed): rows are already wide enough
     66 
     67         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of rows still to pad
     68         test    eax,eax
     69         jle     short .expand_end       ; skip padding when max_v_samp_factor <= 0
     70 
     71         cld                             ; make rep stosb store toward higher addresses
     72         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
     73         alignx  16,7
     74 .expandloop:
     75         push    eax                     ; al is overwritten below, so save the row counter
     76         push    ecx                     ; rep stosb counts ecx down to 0, so save the pad count
     77 
     78         mov     edi, JSAMPROW [esi]     ; edi = start of the current input row
     79         add     edi,edx                 ; edi = first byte after the image_width real samples
     80         mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row
     81 
     82         rep stosb                       ; store ecx copies of al starting at edi
     83 
     84         pop     ecx                     ; pad count back
     85         pop     eax                     ; row counter back
     86 
     87         add     esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
     88         dec     eax
     89         jg      short .expandloop       ; repeat while rows remain
     90 
     91 .expand_end:
     92         pop     ecx                             ; output_cols
     93 
     94         ; -- h2v1_downsample
     95 
     96         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
     97         test    eax,eax
     98         jle     near .return            ; nothing to do when v_samp_factor <= 0 (signed test)
     99 
    100         mov     edx, 0x00010000         ; bias pattern
    101         movd    xmm7,edx                ; low dword of xmm7 = bias pattern (edx no longer holds image_width)
    102         pcmpeqw xmm6,xmm6               ; xmm6 = all bits set
    103         pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    104         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    105 
    106         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    107         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    108         alignx  16,7
    109 .rowloop:
    110         push    ecx                     ; output_cols, consumed by the column loop
    111         push    edi                     ; position in output_data[]
    112         push    esi                     ; position in input_data[]
    113 
    114         mov     esi, JSAMPROW [esi]             ; inptr
    115         mov     edi, JSAMPROW [edi]             ; outptr
    116 
    117         cmp     ecx, byte SIZEOF_XMMWORD
    118         jae     short .columnloop       ; a full XMMWORD of output columns is available
    119         alignx  16,7
    120 
    121 .columnloop_r8:                         ; tail: fewer than SIZEOF_XMMWORD output columns remain
    122         movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; only one input XMMWORD is loaded
    123         pxor    xmm1,xmm1               ; the second input vector is taken as all zeros
    124         mov     ecx, SIZEOF_XMMWORD     ; so the sub below leaves 0 and the column loop ends
    125         jmp     short .downsample
    126         alignx  16,7
    127 
    128 .columnloop:                            ; main path: two input XMMWORDs -> one output XMMWORD
    129         movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
    130         movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
    131 
    132 .downsample:
    133         movdqa  xmm2,xmm0               ; copies used to extract the odd-offset bytes
    134         movdqa  xmm3,xmm1
    135 
    136         pand    xmm0,xmm6               ; xmm0 = even-offset bytes, one per word
    137         psrlw   xmm2,BYTE_BIT           ; xmm2 = odd-offset bytes, one per word
    138         pand    xmm1,xmm6
    139         psrlw   xmm3,BYTE_BIT
    140 
    141         paddw   xmm0,xmm2               ; sum of each horizontal pair
    142         paddw   xmm1,xmm3
    143         paddw   xmm0,xmm7               ; add the alternating 0/1 bias
    144         paddw   xmm1,xmm7
    145         psrlw   xmm0,1                  ; halve the biased sum
    146         psrlw   xmm1,1
    147 
    148         packuswb xmm0,xmm1              ; words back to bytes: xmm0 -> low half, xmm1 -> high half
    149 
    150         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; a full XMMWORD is stored, on the tail path too
    151 
    152         sub     ecx, byte SIZEOF_XMMWORD        ; outcol
    153         add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
    154         add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
    155         cmp     ecx, byte SIZEOF_XMMWORD
    156         jae     short .columnloop       ; another full XMMWORD of output columns remains
    157         test    ecx,ecx
    158         jnz     short .columnloop_r8    ; nonzero remainder: one pass through the tail path
    159 
    160         pop     esi                     ; back to the input_data[] position
    161         pop     edi                     ; back to the output_data[] position
    162         pop     ecx                     ; output_cols for the next row
    163 
    164         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    165         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    166         dec     eax                             ; rowctr
    167         jg      near .rowloop           ; repeat while rows remain
    168 
    169 .return:
    170         pop     edi                     ; undo the two pushes made on entry
    171         pop     esi
    172 ;       pop     edx             ; need not be preserved
    173 ;       pop     ecx             ; need not be preserved
    174 ;       pop     ebx             ; unused
    175         pop     ebp
    176         ret
    177 
    178 ; --------------------------------------------------------------------------
    179 ;
    180 ; Downsample pixel values of a single component.
    181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    182 ; without smoothing.
    183 ;
    184 ; GLOBAL(void)
    185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    186 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    187 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    188 ;
        ; Argument access: after "push ebp / mov ebp,esp" the six arguments are
        ; read from [ebp+8] .. [ebp+28] through the macros defined below.
        ; esi and edi are saved on entry and restored at .return; eax, ecx, edx
        ; and xmm0-xmm7 are overwritten without being saved.
        ;
        ; Steps:
        ;  1. output_cols = width_blocks << 3; return immediately if that is 0.
        ;  2. expand_right_edge: when output_cols*2 exceeds image_width, each of
        ;     the first max_v_samp_factor rows of input_data has the bytes from
        ;     offset image_width up to output_cols*2 filled with a copy of the
        ;     sample at offset image_width-1.
        ;  3. For v_samp_factor output rows, two consecutive input rows r0 and r1
        ;     are combined; each output sample i is computed as
        ;     (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2, with
        ;     bias = 1 for even i and bias = 2 for odd i.
        ;
        ; Every XMMWORD load and store uses movdqa, which faults on an address
        ; that is not 16-byte aligned, so all row pointers must be aligned.
        ; SIZEOF_XMMWORD, SIZEOF_JSAMPROW, BYTE_BIT, EXTN, alignx and the
        ; JDIMENSION/INT/JSAMPARRAY/JSAMPROW/JSAMPLE/XMMWORD size keywords are
        ; not defined in this file (presumably they come from jsimdext.inc).
        ; NOTE(review): on the tail path (.columnloop_r8) the store still writes
        ; a full XMMWORD, so the output row presumably needs padding beyond
        ; output_cols -- confirm against the code that allocates the rows.
    189 
    190 %define img_width(b)    (b)+8           ; JDIMENSION image_width
    191 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
    192 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
    193 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
    194 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
    195 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
    196 
    197         align   16                      ; start the function on a 16-byte boundary
    198         global  EXTN(jsimd_h2v2_downsample_sse2)        ; export the entry point
    199 
    200 EXTN(jsimd_h2v2_downsample_sse2):
    201         push    ebp                     ; build an ebp frame: arguments start at ebp+8
    202         mov     ebp,esp
    203 ;       push    ebx             ; unused
    204 ;       push    ecx             ; need not be preserved
    205 ;       push    edx             ; need not be preserved
    206         push    esi                     ; restored at .return
    207         push    edi                     ; restored at .return
    208 
    209         mov     ecx, JDIMENSION [width_blks(ebp)]       ; ecx = width_blocks
    210         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
    211         jz      near .return            ; ZF comes from the shl: output_cols == 0, nothing to do
    212 
    213         mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width
    214 
    215         ; -- expand_right_edge
    216 
    217         push    ecx                     ; keep output_cols; popped again at .expand_end
    218         shl     ecx,1                           ; output_cols * 2
    219         sub     ecx,edx                 ; ecx = number of pad bytes needed per row
    220         jle     short .expand_end       ; difference <= 0 (signed): rows are already wide enough
    221 
    222         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of rows still to pad
    223         test    eax,eax
    224         jle     short .expand_end       ; skip padding when max_v_samp_factor <= 0
    225 
    226         cld                             ; make rep stosb store toward higher addresses
    227         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    228         alignx  16,7
    229 .expandloop:
    230         push    eax                     ; al is overwritten below, so save the row counter
    231         push    ecx                     ; rep stosb counts ecx down to 0, so save the pad count
    232 
    233         mov     edi, JSAMPROW [esi]     ; edi = start of the current input row
    234         add     edi,edx                 ; edi = first byte after the image_width real samples
    235         mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row
    236 
    237         rep stosb                       ; store ecx copies of al starting at edi
    238 
    239         pop     ecx                     ; pad count back
    240         pop     eax                     ; row counter back
    241 
    242         add     esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
    243         dec     eax
    244         jg      short .expandloop       ; repeat while rows remain
    245 
    246 .expand_end:
    247         pop     ecx                             ; output_cols
    248 
    249         ; -- h2v2_downsample
    250 
    251         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
    252         test    eax,eax
    253         jle     near .return            ; nothing to do when v_samp_factor <= 0 (signed test)
    254 
    255         mov     edx, 0x00020001         ; bias pattern
    256         movd    xmm7,edx                ; low dword of xmm7 = bias pattern (edx no longer holds image_width)
    257         pcmpeqw xmm6,xmm6               ; xmm6 = all bits set
    258         pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    259         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    260 
    261         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    262         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    263         alignx  16,7
    264 .rowloop:
    265         push    ecx                     ; output_cols, consumed by the column loop
    266         push    edi                     ; position in output_data[]
    267         push    esi                     ; position in input_data[]
    268 
    269         mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    270         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
    271         mov     edi, JSAMPROW [edi]                     ; outptr
    272 
    273         cmp     ecx, byte SIZEOF_XMMWORD
    274         jae     short .columnloop       ; a full XMMWORD of output columns is available
    275         alignx  16,7
    276 
    277 .columnloop_r8:                         ; tail: fewer than SIZEOF_XMMWORD output columns remain
    278         movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]    ; one XMMWORD from the upper input row
    279         movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; one XMMWORD from the lower input row
    280         pxor    xmm2,xmm2               ; the second XMMWORD of each row is taken as all zeros
    281         pxor    xmm3,xmm3
    282         mov     ecx, SIZEOF_XMMWORD     ; so the sub below leaves 0 and the column loop ends
    283         jmp     short .downsample
    284         alignx  16,7
    285 
    286 .columnloop:                            ; main path: two XMMWORDs from each input row -> one output XMMWORD
    287         movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]    ; upper row, first XMMWORD
    288         movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; lower row, first XMMWORD
    289         movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]    ; upper row, second XMMWORD
    290         movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; lower row, second XMMWORD
    291 
    292 .downsample:
    293         movdqa  xmm4,xmm0               ; copies used to extract the odd-offset bytes
    294         movdqa  xmm5,xmm1
    295         pand    xmm0,xmm6               ; even-offset bytes, one per word
    296         psrlw   xmm4,BYTE_BIT           ; odd-offset bytes, one per word
    297         pand    xmm1,xmm6
    298         psrlw   xmm5,BYTE_BIT
    299         paddw   xmm0,xmm4               ; horizontal pair sums, upper row, first XMMWORD
    300         paddw   xmm1,xmm5               ; horizontal pair sums, lower row, first XMMWORD
    301 
    302         movdqa  xmm4,xmm2               ; same extraction for the second XMMWORD of each row
    303         movdqa  xmm5,xmm3
    304         pand    xmm2,xmm6
    305         psrlw   xmm4,BYTE_BIT
    306         pand    xmm3,xmm6
    307         psrlw   xmm5,BYTE_BIT
    308         paddw   xmm2,xmm4               ; horizontal pair sums, upper row, second XMMWORD
    309         paddw   xmm3,xmm5               ; horizontal pair sums, lower row, second XMMWORD
    310 
    311         paddw   xmm0,xmm1               ; add the two rows: sum of each 2x2 group
    312         paddw   xmm2,xmm3
    313         paddw   xmm0,xmm7               ; add the alternating 1/2 bias
    314         paddw   xmm2,xmm7
    315         psrlw   xmm0,2                  ; divide the biased sum by 4
    316         psrlw   xmm2,2
    317 
    318         packuswb xmm0,xmm2              ; words back to bytes: xmm0 -> low half, xmm2 -> high half
    319 
    320         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0    ; a full XMMWORD is stored, on the tail path too
    321 
    322         sub     ecx, byte SIZEOF_XMMWORD        ; outcol
    323         add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
    324         add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
    325         add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
    326         cmp     ecx, byte SIZEOF_XMMWORD
    327         jae     near .columnloop        ; another full XMMWORD of output columns remains
    328         test    ecx,ecx
    329         jnz     near .columnloop_r8     ; nonzero remainder: one pass through the tail path
    330 
    331         pop     esi                     ; back to the input_data[] position
    332         pop     edi                     ; back to the output_data[] position
    333         pop     ecx                     ; output_cols for the next row
    334 
    335         add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
    336         add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
    337         dec     eax                             ; rowctr
    338         jg      near .rowloop           ; repeat while output rows remain
    339 
    340 .return:
    341         pop     edi                     ; undo the two pushes made on entry
    342         pop     esi
    343 ;       pop     edx             ; need not be preserved
    344 ;       pop     ecx             ; need not be preserved
    345 ;       pop     ebx             ; unused
    346         pop     ebp
    347         ret
    348 
    349 ; For some reason, the OS X linker does not honor the request to align the
    350 ; segment unless we do this.
    351         align   16                      ; pad the end of the code to a 16-byte boundary
    352