Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsample.asm - downsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler); it
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; [TAB8]
     17 
     18 %include "jsimdext.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21         SECTION SEG_TEXT        ; SEG_TEXT is not defined in this file (presumably jsimdext.inc)
     22         BITS    32              ; everything below is assembled as 32-bit code
     23 ;
     24 ; Downsample pixel values of a single component.
     25 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     26 ; without smoothing.
     27 ; Each output sample is (a + b + bias) >> 1 for a horizontal pair a,b; bias alternates 0,1 per output sample.
     28 ; GLOBAL(void)
     29 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
     30 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     31 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
     32 ; Stack args are read via ebp; esi/edi are saved and restored; eax, ecx, edx, mm0-mm3, mm6, mm7 are overwritten; input rows are padded in place.
     33 
     34 %define img_width(b)    (b)+8           ; JDIMENSION image_width (first argument when b = ebp after the prologue)
     35 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
     36 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
     37 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
     38 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
     39 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
     40 
     41         align   16                      ; place the entry point on a 16-byte boundary
     42         global  EXTN(jsimd_h2v1_downsample_mmx) ; export the entry point (EXTN is a macro from outside this file)
     43 
     44 EXTN(jsimd_h2v1_downsample_mmx):
     45         push    ebp                     ; set up a frame so the argument macros above can use ebp
     46         mov     ebp,esp                 ; [ebp] = saved ebp, [ebp+4] = return address, [ebp+8] = first argument
     47 ;       push    ebx             ; unused
     48 ;       push    ecx             ; need not be preserved
     49 ;       push    edx             ; need not be preserved
     50         push    esi                     ; saved here, restored at .return
     51         push    edi                     ; saved here, restored at .return
     52 
     53         mov     ecx, JDIMENSION [width_blks(ebp)]       ; ecx = width_blocks
     54         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
     55         jz      near .return            ; shl left ZF set: output_cols is 0, nothing to do
     56 
     57         mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width (bytes of real data per input row)
     58 
     59         ; -- expand_right_edge: pad every input row from image_width up to output_cols*2 bytes with its last sample
     60 
     61         push    ecx                     ; keep output_cols for the downsample stage
     62         shl     ecx,1                           ; output_cols * 2
     63         sub     ecx,edx                 ; ecx = number of padding bytes needed per row
     64         jle     short .expand_end       ; rows are already wide enough (signed test)
     65 
     66         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of input rows to pad
     67         test    eax,eax
     68         jle     short .expand_end       ; no rows to pad
     69 
     70         cld                             ; rep stosb below must store towards higher addresses
     71         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
     72         alignx  16,7                    ; alignx is a macro from outside this file; presumably aligns the loop head - confirm
     73 .expandloop:
     74         push    eax                     ; al is reused below as the fill byte
     75         push    ecx                     ; rep stosb counts ecx down to 0
     76 
     77         mov     edi, JSAMPROW [esi]     ; edi = start of the current input row
     78         add     edi,edx                 ; edi = first byte past the real samples
     79         mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row
     80 
     81         rep stosb                       ; append ecx copies of al to the row
     82 
     83         pop     ecx                     ; restore the per-row padding count
     84         pop     eax                     ; restore the row counter
     85 
     86         add     esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
     87         dec     eax
     88         jg      short .expandloop       ; repeat while rows remain
     89 
     90 .expand_end:
     91         pop     ecx                             ; output_cols
     92 
     93         ; -- h2v1_downsample: average each horizontal pair of input samples into one output sample
     94 
     95         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
     96         test    eax,eax
     97         jle     near .return            ; no output rows requested
     98 
     99         mov       edx, 0x00010000       ; bias pattern (low word 0, high word 1); image_width in edx is no longer needed
    100         movd      mm7,edx               ; low dword of mm7 = bias pattern, high dword cleared
    101         pcmpeqw   mm6,mm6               ; mm6 = all bits set
    102         punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
    103         psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
    104 
    105         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    106         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    107         alignx  16,7                    ; macro from outside this file; presumably aligns the loop head - confirm
    108 .rowloop:
    109         push    ecx                     ; save output_cols (counted down by the column loop)
    110         push    edi                     ; save position in the output_data row-pointer array
    111         push    esi                     ; save position in the input_data row-pointer array
    112 
    113         mov     esi, JSAMPROW [esi]             ; inptr
    114         mov     edi, JSAMPROW [edi]             ; outptr
    115         alignx  16,7                    ; macro from outside this file; presumably aligns the loop head - confirm
    116 .columnloop:
    117 
    118         movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm0 = first MMWORD of input samples
    119         movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm1 = second MMWORD of input samples
    120         movq    mm2,mm0                 ; copies used to extract the other sample of each pair
    121         movq    mm3,mm1
    122 
    123         pand    mm0,mm6                 ; mm0 = low byte of each word (first sample of each pair)
    124         psrlw   mm2,BYTE_BIT            ; mm2 = high byte of each word (second sample of each pair)
    125         pand    mm1,mm6                 ; same split for the second MMWORD
    126         psrlw   mm3,BYTE_BIT
    127 
    128         paddw   mm0,mm2                 ; mm0 = pair sums as 16-bit words
    129         paddw   mm1,mm3                 ; mm1 = pair sums for the second MMWORD
    130         paddw   mm0,mm7                 ; add the alternating 0/1 rounding bias
    131         paddw   mm1,mm7
    132         psrlw   mm0,1                   ; halve: (a + b + bias) >> 1
    133         psrlw   mm1,1
    134 
    135         packuswb mm0,mm1                ; narrow the words of mm0 and mm1 back to bytes in one MMWORD
    136 
    137         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0       ; store one MMWORD of output samples
    138 
    139         add     esi, byte 2*SIZEOF_MMWORD       ; inptr
    140         add     edi, byte 1*SIZEOF_MMWORD       ; outptr
    141         sub     ecx, byte SIZEOF_MMWORD         ; outcol
    142         jnz     short .columnloop       ; loop until ecx (started as width_blocks*8) reaches 0
    143 
    144         pop     esi                     ; back to the input_data row-pointer array
    145         pop     edi                     ; back to the output_data row-pointer array
    146         pop     ecx                     ; output_cols for the next row
    147 
    148         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    149         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    150         dec     eax                             ; rowctr
    151         jg      short .rowloop          ; repeat while rows remain
    152 
    153         emms            ; empty MMX state
    154 
    155 .return:                                ; the early exits above jump here before any MMX instruction runs, so they skip emms
    156         pop     edi                     ; restore registers saved in the prologue
    157         pop     esi
    158 ;       pop     edx             ; need not be preserved
    159 ;       pop     ecx             ; need not be preserved
    160 ;       pop     ebx             ; unused
    161         pop     ebp
    162         ret                             ; plain ret: stack arguments are left for the caller
    163 
    164 ; --------------------------------------------------------------------------
    165 ;
    166 ; Downsample pixel values of a single component.
    167 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    168 ; without smoothing.
    169 ; Each output sample is (sum of a 2x2 input block + bias) >> 2; bias alternates 1,2 per output sample.
    170 ; GLOBAL(void)
    171 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
    172 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    173 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
    174 ; Stack args are read via ebp; esi/edi are saved and restored; eax, ecx, edx, mm0-mm7 are overwritten; input rows are padded in place.
    175 
    176 %define img_width(b)    (b)+8           ; JDIMENSION image_width (first argument when b = ebp after the prologue)
    177 %define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
    178 %define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
    179 %define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
    180 %define input_data(b)   (b)+24          ; JSAMPARRAY input_data
    181 %define output_data(b)  (b)+28          ; JSAMPARRAY output_data
    182 
    183         align   16                      ; place the entry point on a 16-byte boundary
    184         global  EXTN(jsimd_h2v2_downsample_mmx) ; export the entry point (EXTN is a macro from outside this file)
    185 
    186 EXTN(jsimd_h2v2_downsample_mmx):
    187         push    ebp                     ; set up a frame so the argument macros above can use ebp
    188         mov     ebp,esp                 ; [ebp] = saved ebp, [ebp+4] = return address, [ebp+8] = first argument
    189 ;       push    ebx             ; unused
    190 ;       push    ecx             ; need not be preserved
    191 ;       push    edx             ; need not be preserved
    192         push    esi                     ; saved here, restored at .return
    193         push    edi                     ; saved here, restored at .return
    194 
    195         mov     ecx, JDIMENSION [width_blks(ebp)]       ; ecx = width_blocks
    196         shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
    197         jz      near .return            ; shl left ZF set: output_cols is 0, nothing to do
    198 
    199         mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width (bytes of real data per input row)
    200 
    201         ; -- expand_right_edge: pad every input row from image_width up to output_cols*2 bytes with its last sample
    202 
    203         push    ecx                     ; keep output_cols for the downsample stage
    204         shl     ecx,1                           ; output_cols * 2
    205         sub     ecx,edx                 ; ecx = number of padding bytes needed per row
    206         jle     short .expand_end       ; rows are already wide enough (signed test)
    207 
    208         mov     eax, INT [max_v_samp(ebp)]      ; eax = number of input rows to pad
    209         test    eax,eax
    210         jle     short .expand_end       ; no rows to pad
    211 
    212         cld                             ; rep stosb below must store towards higher addresses
    213         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    214         alignx  16,7                    ; alignx is a macro from outside this file; presumably aligns the loop head - confirm
    215 .expandloop:
    216         push    eax                     ; al is reused below as the fill byte
    217         push    ecx                     ; rep stosb counts ecx down to 0
    218 
    219         mov     edi, JSAMPROW [esi]     ; edi = start of the current input row
    220         add     edi,edx                 ; edi = first byte past the real samples
    221         mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row
    222 
    223         rep stosb                       ; append ecx copies of al to the row
    224 
    225         pop     ecx                     ; restore the per-row padding count
    226         pop     eax                     ; restore the row counter
    227 
    228         add     esi, byte SIZEOF_JSAMPROW       ; advance to the next row pointer
    229         dec     eax
    230         jg      short .expandloop       ; repeat while rows remain
    231 
    232 .expand_end:
    233         pop     ecx                             ; output_cols
    234 
    235         ; -- h2v2_downsample: average each 2x2 block of input samples into one output sample
    236 
    237         mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
    238         test    eax,eax
    239         jle     near .return            ; no output rows requested
    240 
    241         mov       edx, 0x00020001       ; bias pattern (low word 1, high word 2); image_width in edx is no longer needed
    242         movd      mm7,edx               ; low dword of mm7 = bias pattern, high dword cleared
    243         pcmpeqw   mm6,mm6               ; mm6 = all bits set
    244         punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
    245         psrlw     mm6,BYTE_BIT          ; mm6={0xFF 0x00 0xFF 0x00 ..}
    246 
    247         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    248         mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
    249         alignx  16,7                    ; macro from outside this file; presumably aligns the loop head - confirm
    250 .rowloop:
    251         push    ecx                     ; save output_cols (counted down by the column loop)
    252         push    edi                     ; save position in the output_data row-pointer array
    253         push    esi                     ; save position in the input_data row-pointer array
    254 
    255         mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    256         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
    257         mov     edi, JSAMPROW [edi]                     ; outptr
    258         alignx  16,7                    ; macro from outside this file; presumably aligns the loop head - confirm
    259 .columnloop:
    260 
    261         movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm0 = upper row, first MMWORD
    262         movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm1 = lower row, first MMWORD
    263         movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm2 = upper row, second MMWORD
    264         movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm3 = lower row, second MMWORD
    265 
    266         movq    mm4,mm0                 ; copies used to extract the other sample of each pair
    267         movq    mm5,mm1
    268         pand    mm0,mm6                 ; low byte of each word (first sample of each pair)
    269         psrlw   mm4,BYTE_BIT            ; high byte of each word (second sample of each pair)
    270         pand    mm1,mm6
    271         psrlw   mm5,BYTE_BIT
    272         paddw   mm0,mm4                 ; mm0 = horizontal pair sums, upper row, first MMWORD
    273         paddw   mm1,mm5                 ; mm1 = horizontal pair sums, lower row, first MMWORD
    274 
    275         movq    mm4,mm2                 ; same split for the second MMWORD of both rows
    276         movq    mm5,mm3
    277         pand    mm2,mm6
    278         psrlw   mm4,BYTE_BIT
    279         pand    mm3,mm6
    280         psrlw   mm5,BYTE_BIT
    281         paddw   mm2,mm4                 ; mm2 = horizontal pair sums, upper row, second MMWORD
    282         paddw   mm3,mm5                 ; mm3 = horizontal pair sums, lower row, second MMWORD
    283 
    284         paddw   mm0,mm1                 ; mm0 = 2x2 block sums (upper + lower row)
    285         paddw   mm2,mm3                 ; mm2 = 2x2 block sums for the second MMWORD
    286         paddw   mm0,mm7                 ; add the alternating 1/2 rounding bias
    287         paddw   mm2,mm7
    288         psrlw   mm0,2                   ; quarter: (block sum + bias) >> 2
    289         psrlw   mm2,2
    290 
    291         packuswb mm0,mm2                ; narrow the words of mm0 and mm2 back to bytes in one MMWORD
    292 
    293         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0       ; store one MMWORD of output samples
    294 
    295         add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
    296         add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
    297         add     edi, byte 1*SIZEOF_MMWORD       ; outptr
    298         sub     ecx, byte SIZEOF_MMWORD         ; outcol
    299         jnz     near .columnloop        ; loop until ecx (started as width_blocks*8) reaches 0
    300 
    301         pop     esi                     ; back to the input_data row-pointer array
    302         pop     edi                     ; back to the output_data row-pointer array
    303         pop     ecx                     ; output_cols for the next row
    304 
    305         add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
    306         add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
    307         dec     eax                             ; rowctr
    308         jg      near .rowloop           ; repeat while rows remain
    309 
    310         emms            ; empty MMX state
    311 
    312 .return:                                ; the early exits above jump here before any MMX instruction runs, so they skip emms
    313         pop     edi                     ; restore registers saved in the prologue
    314         pop     esi
    315 ;       pop     edx             ; need not be preserved
    316 ;       pop     ecx             ; need not be preserved
    317 ;       pop     ebx             ; unused
    318         pop     ebp
    319         ret                             ; plain ret: stack arguments are left for the caller
    320 
    321 ; For some reason, the OS X linker does not honor the request to align the
    322 ; segment unless we do this.
    323         align   16                      ; pad the end of the code emitted above to a 16-byte boundary
    324