Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsample.asm - downsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22         SECTION SEG_TEXT                ; code section; SEG_TEXT is presumably defined by jsimdext.inc
     23         BITS    32                      ; 32-bit code: the routines below use e-registers and stack arguments
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
     31 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     32 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
     33 ;
     34 
     35 %define img_width(b)    (b)+8           ; JDIMENSION image_width
     36 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
     37 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
     38 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
     39 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
     40 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
     41 
        ; The macros above give each argument's offset from the frame pointer
        ; (ebp) once the prologue below has run.
        ;
        ; Outline of jsimd_h2v1_downsample_mmx:
        ;   1. output_cols = width_blocks << 3; return immediately if it is zero.
        ;   2. expand_right_edge: for each of max_v_samp_factor input rows, the
        ;      (output_cols*2 - image_width) bytes that follow the last real sample
        ;      are overwritten with a copy of that sample (skipped when that count
        ;      is <= 0).
        ;   3. h2v1_downsample: for each of v_samp_factor rows, every horizontal
        ;      pair of input samples (a, b) becomes (a + b + bias) >> 1, with the
        ;      bias alternating 0, 1 across output columns.
        ;
        ; Register roles:
        ;   eax = row counter            ecx = output_cols / remaining columns
        ;   edx = image_width, later scratch for the bias constant
        ;   esi = input row-pointer array / current input row
        ;   edi = output row-pointer array / current output row
        ;   mm6 = per-word low-byte mask  mm7 = per-word bias
        ; Clobbers: eax, ecx, edx, mm0-mm3, mm6, mm7, flags (DF is cleared when
        ;   the padding step runs).
        ; Preserves: ebp, esi, edi.  ebx is never touched.
     42         align   16
     43         global  EXTN(jsimd_h2v1_downsample_mmx)
     44 
     45 EXTN(jsimd_h2v1_downsample_mmx):
     46         push    ebp
     47         mov     ebp,esp
     48 ;       push    ebx             ; unused
     49 ;       push    ecx             ; need not be preserved
     50 ;       push    edx             ; need not be preserved
     51         push    esi
     52         push    edi
     53 
     54         mov     ecx, JDIMENSION [width_blks(ebp)]
     55         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
     56         jz      near .return            ; zero columns: nothing to do (no MMX state touched yet)
     57 
     58         mov     edx, JDIMENSION [img_width(ebp)]
     59 
     60         ; -- expand_right_edge
                ; Pad each input row out to output_cols*2 samples so the column
                ; loop below, which always consumes whole quadwords, reads defined
                ; data.
     61 
     62         push    ecx                             ; keep output_cols for the downsample stage
     63         shl     ecx,1                           ; output_cols * 2
     64         sub     ecx,edx                         ; ecx = number of padding bytes per row
     65         jle     short .expand_end               ; rows are already wide enough
     66 
     67         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of rows to pad
     68         test    eax,eax
     69         jle     short .expand_end
     70 
     71         cld                                     ; stosb must advance edi upward
     72         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
     73         alignx  16,7
     74 .expandloop:
                ; eax and ecx are saved because al is reused for the fill value
                ; and rep stosb counts ecx down to zero.
     75         push    eax
     76         push    ecx
     77 
     78         mov     edi, JSAMPROW [esi]             ; edi = start of this row
     79         add     edi,edx                         ; edi = one past the last real sample
     80         mov     al, JSAMPLE [edi-1]             ; al = last real sample
     81 
     82         rep stosb                               ; store ecx copies of al starting at edi
     83 
     84         pop     ecx
     85         pop     eax
     86 
     87         add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
     88         dec     eax
     89         jg      short .expandloop
     90 
     91 .expand_end:
     92         pop     ecx                             ; output_cols
     93 
     94         ; -- h2v1_downsample
     95 
     96         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
     97         test    eax,eax
     98         jle     near .return                    ; still before any MMX instruction, so no emms needed
     99 
    100         mov       edx, 0x00010000       ; bias pattern
    101         movd      mm7,edx
    102         pcmpeqw   mm6,mm6               ; mm6 = all ones
    103         punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
    104         psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
    105 
    106         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    107         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    108         alignx  16,7
    109 .rowloop:
                ; Save the column count and both row-pointer-array cursors; the
                ; column loop uses ecx, esi and edi as its own working registers.
    110         push    ecx
    111         push    edi
    112         push    esi
    113 
    114         mov     esi, JSAMPROW [esi]             ; inptr
    115         mov     edi, JSAMPROW [edi]             ; outptr
    116         alignx  16,7
    117 .columnloop:
                ; Each pass reads two quadwords of input samples and writes one
                ; quadword of output samples.
    118 
    119         movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    120         movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
    121         movq    mm2,mm0
    122         movq    mm3,mm1
    123 
    124         pand    mm0,mm6                 ; low byte of each word  = first sample of each pair
    125         psrlw   mm2,BYTE_BIT            ; high byte of each word = second sample of each pair
    126         pand    mm1,mm6
    127         psrlw   mm3,BYTE_BIT
    128 
    129         paddw   mm0,mm2                 ; a + b for each pair
    130         paddw   mm1,mm3
    131         paddw   mm0,mm7                 ; + alternating bias
    132         paddw   mm1,mm7
    133         psrlw   mm0,1                   ; / 2
    134         psrlw   mm1,1
    135 
    136         packuswb mm0,mm1                ; narrow the eight word results back to bytes
    137 
    138         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
    139 
    140         add     esi, byte 2*SIZEOF_MMWORD       ; inptr
    141         add     edi, byte 1*SIZEOF_MMWORD       ; outptr
    142         sub     ecx, byte SIZEOF_MMWORD         ; outcol
                ; The loop only ends when ecx reaches exactly zero; ecx is a
                ; multiple of 8 (from the shl above), so this presumably relies on
                ; SIZEOF_MMWORD being 8 -- defined outside this file, confirm.
    143         jnz     short .columnloop
    144 
    145         pop     esi
    146         pop     edi
    147         pop     ecx
    148 
    149         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    150         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    151         dec     eax                             ; rowctr
    152         jg      short .rowloop
    153 
    154         emms            ; empty MMX state
    155 
    156 .return:
    157         pop     edi
    158         pop     esi
    159 ;       pop     edx             ; need not be preserved
    160 ;       pop     ecx             ; need not be preserved
    161 ;       pop     ebx             ; unused
    162         pop     ebp
    163         ret
    164 
    165 ; --------------------------------------------------------------------------
    166 ;
    167 ; Downsample pixel values of a single component.
    168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    169 ; without smoothing.
    170 ;
    171 ; GLOBAL(void)
    172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
    173 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    174 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
    175 ;
    176 
    177 %define img_width(b)    (b)+8           ; JDIMENSION image_width
    178 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
    179 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
    180 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
    181 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
    182 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
    183 
        ; The macros above give each argument's offset from the frame pointer
        ; (ebp) once the prologue below has run.
        ;
        ; Outline of jsimd_h2v2_downsample_mmx:
        ;   1. output_cols = width_blocks << 3; return immediately if it is zero.
        ;   2. expand_right_edge: for each of max_v_samp_factor input rows, the
        ;      (output_cols*2 - image_width) bytes that follow the last real sample
        ;      are overwritten with a copy of that sample (skipped when that count
        ;      is <= 0).
        ;   3. h2v2_downsample: for each of v_samp_factor output rows, two
        ;      consecutive input rows are read and every 2x2 group of samples
        ;      (a, b / c, d) becomes (a + b + c + d + bias) >> 2, with the bias
        ;      alternating 1, 2 across output columns.
        ;
        ; Register roles:
        ;   eax = row counter            ecx = output_cols / remaining columns
        ;   edx = image_width, then bias scratch, then upper input row (inptr0)
        ;   esi = input row-pointer array / lower input row (inptr1)
        ;   edi = output row-pointer array / current output row
        ;   mm6 = per-word low-byte mask  mm7 = per-word bias
        ; Clobbers: eax, ecx, edx, mm0-mm7, flags (DF is cleared when the padding
        ;   step runs).
        ; Preserves: ebp, esi, edi.  ebx is never touched.
    184         align   16
    185         global  EXTN(jsimd_h2v2_downsample_mmx)
    186 
    187 EXTN(jsimd_h2v2_downsample_mmx):
    188         push    ebp
    189         mov     ebp,esp
    190 ;       push    ebx             ; unused
    191 ;       push    ecx             ; need not be preserved
    192 ;       push    edx             ; need not be preserved
    193         push    esi
    194         push    edi
    195 
    196         mov     ecx, JDIMENSION [width_blks(ebp)]
    197         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
    198         jz      near .return            ; zero columns: nothing to do (no MMX state touched yet)
    199 
    200         mov     edx, JDIMENSION [img_width(ebp)]
    201 
    202         ; -- expand_right_edge
                ; Pad each input row out to output_cols*2 samples so the column
                ; loop below, which always consumes whole quadwords, reads defined
                ; data.
    203 
    204         push    ecx                             ; keep output_cols for the downsample stage
    205         shl     ecx,1                           ; output_cols * 2
    206         sub     ecx,edx                         ; ecx = number of padding bytes per row
    207         jle     short .expand_end               ; rows are already wide enough
    208 
    209         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of rows to pad
    210         test    eax,eax
    211         jle     short .expand_end
    212 
    213         cld                                     ; stosb must advance edi upward
    214         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    215         alignx  16,7
    216 .expandloop:
                ; eax and ecx are saved because al is reused for the fill value
                ; and rep stosb counts ecx down to zero.
    217         push    eax
    218         push    ecx
    219 
    220         mov     edi, JSAMPROW [esi]             ; edi = start of this row
    221         add     edi,edx                         ; edi = one past the last real sample
    222         mov     al, JSAMPLE [edi-1]             ; al = last real sample
    223 
    224         rep stosb                               ; store ecx copies of al starting at edi
    225 
    226         pop     ecx
    227         pop     eax
    228 
    229         add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
    230         dec     eax
    231         jg      short .expandloop
    232 
    233 .expand_end:
    234         pop     ecx                             ; output_cols
    235 
    236         ; -- h2v2_downsample
    237 
    238         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
    239         test    eax,eax
    240         jle     near .return                    ; still before any MMX instruction, so no emms needed
    241 
    242         mov       edx, 0x00020001       ; bias pattern
    243         movd      mm7,edx
    244         pcmpeqw   mm6,mm6               ; mm6 = all ones
    245         punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
    246         psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
    247 
    248         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    249         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    250         alignx  16,7
    251 .rowloop:
                ; Save the column count and both row-pointer-array cursors; the
                ; column loop uses ecx, edx, esi and edi as working registers.
    252         push    ecx
    253         push    edi
    254         push    esi
    255 
    256         mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    257         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
    258         mov     edi, JSAMPROW [edi]                     ; outptr
    259         alignx  16,7
    260 .columnloop:
                ; Each pass reads two quadwords from each of the two input rows
                ; and writes one quadword of output samples.
    261 
    262         movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
    263         movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    264         movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
    265         movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]
    266 
                ; First quadword of each row: add the two bytes of every word.
    267         movq    mm4,mm0
    268         movq    mm5,mm1
    269         pand    mm0,mm6                 ; low byte of each word  = first sample of each pair
    270         psrlw   mm4,BYTE_BIT            ; high byte of each word = second sample of each pair
    271         pand    mm1,mm6
    272         psrlw   mm5,BYTE_BIT
    273         paddw   mm0,mm4                 ; row 0: a + b
    274         paddw   mm1,mm5                 ; row 1: c + d
    275 
                ; Second quadword of each row: same horizontal pair sums.
    276         movq    mm4,mm2
    277         movq    mm5,mm3
    278         pand    mm2,mm6
    279         psrlw   mm4,BYTE_BIT
    280         pand    mm3,mm6
    281         psrlw   mm5,BYTE_BIT
    282         paddw   mm2,mm4
    283         paddw   mm3,mm5
    284 
    285         paddw   mm0,mm1                 ; a + b + c + d
    286         paddw   mm2,mm3
    287         paddw   mm0,mm7                 ; + alternating bias
    288         paddw   mm2,mm7
    289         psrlw   mm0,2                   ; / 4
    290         psrlw   mm2,2
    291 
    292         packuswb mm0,mm2                ; narrow the eight word results back to bytes
    293 
    294         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
    295 
    296         add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
    297         add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
    298         add     edi, byte 1*SIZEOF_MMWORD       ; outptr
    299         sub     ecx, byte SIZEOF_MMWORD         ; outcol
                ; The loop only ends when ecx reaches exactly zero; ecx is a
                ; multiple of 8 (from the shl above), so this presumably relies on
                ; SIZEOF_MMWORD being 8 -- defined outside this file, confirm.
    300         jnz     near .columnloop
    301 
    302         pop     esi
    303         pop     edi
    304         pop     ecx
    305 
    306         add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
    307         add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
    308         dec     eax                             ; rowctr
    309         jg      near .rowloop
    310 
    311         emms            ; empty MMX state
    312 
    313 .return:
    314         pop     edi
    315         pop     esi
    316 ;       pop     edx             ; need not be preserved
    317 ;       pop     ecx             ; need not be preserved
    318 ;       pop     ebx             ; unused
    319         pop     ebp
    320         ret
    321 
    322 ; For some reason, the OS X linker does not honor the request to align the
    323 ; segment unless we do this.
    324         align   16                      ; pad the end of this section to a 16-byte boundary
    325