Home | History | Annotate | Download | only in i386
      1 ;
      2 ; jcsample.asm - downsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22     SECTION     SEG_TEXT                ; code section; SEG_TEXT is not defined in this file (presumably jsimdext.inc)
     23     BITS        32                      ; assemble everything below as 32-bit code
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
     31 ;                           JDIMENSION v_samp_factor,
     32 ;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     33 ;                           JSAMPARRAY output_data);
     34 ;
        ; What the code below does:
        ;   1. output_cols = width_in_blocks * 8; return immediately if that is 0.
        ;   2. If (2 * output_cols - image_width) > 0, then for each of
        ;      max_v_samp_factor input rows, copy the sample at column
        ;      image_width - 1 into that many bytes following it.
        ;   3. For each of v_samp_factor rows, turn every horizontal pair of input
        ;      samples (a, b) into one output sample (a + b + bias) >> 1, where
        ;      bias alternates 0, 1, 0, 1, ... across output columns.
        ;
        ; Arguments are read from the stack relative to ebp (first one at ebp+8).
        ; ebp, esi and edi are saved and restored; eax, ecx, edx, the flags and
        ; mm0-mm3, mm6, mm7 are overwritten.
        ;
        ; NOTE(review): BYTE_BIT, SIZEOF_MMWORD, SIZEOF_JSAMPROW, alignx and the
        ; JDIMENSION/INT/JSAMPARRAY/JSAMPROW/JSAMPLE/MMWORD size keywords are not
        ; defined in this file (presumably jsimdext.inc).  The comments below
        ; assume BYTE_BIT = 8 and SIZEOF_MMWORD = 8 -- confirm against that file.
     35 
     36 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
     37 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
     38 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
     39 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
     40 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
     41 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
     42 
     43     align       32                      ; start the function on a 32-byte boundary
     44     GLOBAL_FUNCTION(jsimd_h2v1_downsample_mmx)  ; macro not defined here; presumably exports the symbol
     45 
     46 EXTN(jsimd_h2v1_downsample_mmx):
     47     push        ebp                     ; set up a frame so the args sit at ebp+8 and up
     48     mov         ebp, esp
     49 ;   push        ebx                     ; unused
     50 ;   push        ecx                     ; need not be preserved
     51 ;   push        edx                     ; need not be preserved
     52     push        esi                     ; saved here, restored at .return
     53     push        edi                     ; saved here, restored at .return
     54 
     55     mov         ecx, JDIMENSION [width_blks(ebp)]  ; ecx = width_in_blocks
     56     shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
     57     jz          near .return            ; shl left ZF set: output_cols == 0, nothing to do
     58 
     59     mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
     60 
     61     ; -- expand_right_edge
     62 
     63     push        ecx                     ; keep output_cols; ecx becomes the padding count
     64     shl         ecx, 1                  ; output_cols * 2
     65     sub         ecx, edx                ; ecx = bytes to pad per row
     66     jle         short .expand_end       ; result <= 0 (signed): no padding needed
     67 
     68     mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows left to pad
     69     test        eax, eax
     70     jle         short .expand_end       ; max_v_samp_factor <= 0: no rows to pad
     71 
     72     cld                                 ; direction flag clear so stosb moves edi upward
     73     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
     74     alignx      16, 7
     75 .expandloop:
     76     push        eax                     ; al is overwritten below; keep the row counter
     77     push        ecx                     ; rep stosb counts ecx down; keep the pad count
     78 
     79     mov         edi, JSAMPROW [esi]     ; edi = start of the current input row
     80     add         edi, edx                ; edi = row + image_width (first byte to fill)
     81     mov         al, JSAMPLE [edi-1]     ; al = sample at column image_width - 1
     82 
     83     rep stosb                           ; store al into ecx consecutive bytes at edi
     84 
     85     pop         ecx                     ; pad count again
     86     pop         eax                     ; row counter again
     87 
     88     add         esi, byte SIZEOF_JSAMPROW  ; advance to the next row pointer
     89     dec         eax
     90     jg          short .expandloop       ; repeat while rows remain
     91 
     92 .expand_end:
     93     pop         ecx                     ; output_cols
     94 
     95     ; -- h2v1_downsample
     96 
     97     mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
     98     test        eax, eax
     99     jle         near .return            ; rowctr <= 0 (signed test): nothing to downsample
    100 
    101     mov         edx, 0x00010000         ; bias pattern
    102     movd        mm7, edx                ; low dword of mm7 = words {0, 1}
    103     pcmpeqw     mm6, mm6                ; mm6 = all bits set
    104     punpckldq   mm7, mm7                ; mm7={0, 1, 0, 1}
    105     psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    106 
    107     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    108     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    109     alignx      16, 7
    110 .rowloop:
    111     push        ecx                     ; column loop counts ecx down to 0
    112     push        edi                     ; keep position in output_data
    113     push        esi                     ; keep position in input_data
    114 
    115     mov         esi, JSAMPROW [esi]     ; inptr
    116     mov         edi, JSAMPROW [edi]     ; outptr
    117     alignx      16, 7
    118 .columnloop:
    119 
    120     movq        mm0, MMWORD [esi+0*SIZEOF_MMWORD]  ; input quadword 0
    121     movq        mm1, MMWORD [esi+1*SIZEOF_MMWORD]  ; input quadword 1
    122     movq        mm2, mm0                ; copies used to extract the odd bytes
    123     movq        mm3, mm1
    124 
    125     pand        mm0, mm6                ; mm0 = even-offset bytes as words
    126     psrlw       mm2, BYTE_BIT           ; mm2 = odd-offset bytes as words
    127     pand        mm1, mm6                ; same split for quadword 1
    128     psrlw       mm3, BYTE_BIT
    129 
    130     paddw       mm0, mm2                ; pair sums for outputs 0..3
    131     paddw       mm1, mm3                ; pair sums for outputs 4..7
    132     paddw       mm0, mm7                ; add the alternating 0/1 bias
    133     paddw       mm1, mm7
    134     psrlw       mm0, 1                  ; divide each sum by 2
    135     psrlw       mm1, 1
    136 
    137     packuswb    mm0, mm1                ; narrow the 8 words to 8 bytes
    138 
    139     movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0  ; store one quadword of output samples
    140 
    141     add         esi, byte 2*SIZEOF_MMWORD  ; inptr
    142     add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    143     sub         ecx, byte SIZEOF_MMWORD    ; outcol
    144     jnz         short .columnloop       ; loop until the column count reaches 0
    145 
    146     pop         esi                     ; back to the input_data position
    147     pop         edi                     ; back to the output_data position
    148     pop         ecx                     ; output_cols for the next row
    149 
    150     add         esi, byte SIZEOF_JSAMPROW  ; input_data
    151     add         edi, byte SIZEOF_JSAMPROW  ; output_data
    152     dec         eax                        ; rowctr
    153     jg          short .rowloop          ; repeat while rows remain
    154 
    155     emms                                ; empty MMX state
    156 
    157 .return:
    158     pop         edi                     ; restore in reverse order of the prologue pushes
    159     pop         esi
    160 ;   pop         edx                     ; need not be preserved
    161 ;   pop         ecx                     ; need not be preserved
    162 ;   pop         ebx                     ; unused
    163     pop         ebp
    164     ret
    165 
    166 ; --------------------------------------------------------------------------
    167 ;
    168 ; Downsample pixel values of a single component.
    169 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    170 ; without smoothing.
    171 ;
    172 ; GLOBAL(void)
    173 ; jsimd_h2v2_downsample_mmx(JDIMENSION image_width, int max_v_samp_factor,
    174 ;                           JDIMENSION v_samp_factor,
    175 ;                           JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    176 ;                           JSAMPARRAY output_data);
    177 ;
        ; What the code below does:
        ;   1. output_cols = width_in_blocks * 8; return immediately if that is 0.
        ;   2. If (2 * output_cols - image_width) > 0, then for each of
        ;      max_v_samp_factor input rows, copy the sample at column
        ;      image_width - 1 into that many bytes following it.
        ;   3. For each of v_samp_factor output rows, read two consecutive input
        ;      rows and turn every 2x2 group of samples into one output sample
        ;      (sum of the four + bias) >> 2, where bias alternates 1, 2, 1, 2, ...
        ;      across output columns.
        ;
        ; Arguments are read from the stack relative to ebp (first one at ebp+8).
        ; ebp, esi and edi are saved and restored; eax, ecx, edx, the flags and
        ; mm0-mm7 are overwritten.
        ;
        ; NOTE(review): BYTE_BIT, SIZEOF_MMWORD, SIZEOF_JSAMPROW, alignx and the
        ; JDIMENSION/INT/JSAMPARRAY/JSAMPROW/JSAMPLE/MMWORD size keywords are not
        ; defined in this file (presumably jsimdext.inc).  The comments below
        ; assume BYTE_BIT = 8 and SIZEOF_MMWORD = 8 -- confirm against that file.
    178 
    179 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
    180 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
    181 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
    182 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
    183 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
    184 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
    185 
    186     align       32                      ; start the function on a 32-byte boundary
    187     GLOBAL_FUNCTION(jsimd_h2v2_downsample_mmx)  ; macro not defined here; presumably exports the symbol
    188 
    189 EXTN(jsimd_h2v2_downsample_mmx):
    190     push        ebp                     ; set up a frame so the args sit at ebp+8 and up
    191     mov         ebp, esp
    192 ;   push        ebx                     ; unused
    193 ;   push        ecx                     ; need not be preserved
    194 ;   push        edx                     ; need not be preserved
    195     push        esi                     ; saved here, restored at .return
    196     push        edi                     ; saved here, restored at .return
    197 
    198     mov         ecx, JDIMENSION [width_blks(ebp)]  ; ecx = width_in_blocks
    199     shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    200     jz          near .return            ; shl left ZF set: output_cols == 0, nothing to do
    201 
    202     mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
    203 
    204     ; -- expand_right_edge
    205 
    206     push        ecx                     ; keep output_cols; ecx becomes the padding count
    207     shl         ecx, 1                  ; output_cols * 2
    208     sub         ecx, edx                ; ecx = bytes to pad per row
    209     jle         short .expand_end       ; result <= 0 (signed): no padding needed
    210 
    211     mov         eax, INT [max_v_samp(ebp)]  ; eax = number of rows left to pad
    212     test        eax, eax
    213     jle         short .expand_end       ; max_v_samp_factor <= 0: no rows to pad
    214 
    215     cld                                 ; direction flag clear so stosb moves edi upward
    216     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
    217     alignx      16, 7
    218 .expandloop:
    219     push        eax                     ; al is overwritten below; keep the row counter
    220     push        ecx                     ; rep stosb counts ecx down; keep the pad count
    221 
    222     mov         edi, JSAMPROW [esi]     ; edi = start of the current input row
    223     add         edi, edx                ; edi = row + image_width (first byte to fill)
    224     mov         al, JSAMPLE [edi-1]     ; al = sample at column image_width - 1
    225 
    226     rep stosb                           ; store al into ecx consecutive bytes at edi
    227 
    228     pop         ecx                     ; pad count again
    229     pop         eax                     ; row counter again
    230 
    231     add         esi, byte SIZEOF_JSAMPROW  ; advance to the next row pointer
    232     dec         eax
    233     jg          short .expandloop       ; repeat while rows remain
    234 
    235 .expand_end:
    236     pop         ecx                     ; output_cols
    237 
    238     ; -- h2v2_downsample
    239 
    240     mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    241     test        eax, eax
    242     jle         near .return            ; rowctr <= 0 (signed test): nothing to downsample
    243 
    244     mov         edx, 0x00020001         ; bias pattern
    245     movd        mm7, edx                ; low dword of mm7 = words {1, 2}
    246     pcmpeqw     mm6, mm6                ; mm6 = all bits set
    247     punpckldq   mm7, mm7                ; mm7={1, 2, 1, 2}
    248     psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    249 
    250     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    251     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    252     alignx      16, 7
    253 .rowloop:
    254     push        ecx                     ; column loop counts ecx down to 0
    255     push        edi                     ; keep position in output_data
    256     push        esi                     ; keep position in input_data
    257 
    258     mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    259     mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    260     mov         edi, JSAMPROW [edi]                    ; outptr
    261     alignx      16, 7
    262 .columnloop:
    263 
    264     movq        mm0, MMWORD [edx+0*SIZEOF_MMWORD]  ; row 0, input quadword 0
    265     movq        mm1, MMWORD [esi+0*SIZEOF_MMWORD]  ; row 1, input quadword 0
    266     movq        mm2, MMWORD [edx+1*SIZEOF_MMWORD]  ; row 0, input quadword 1
    267     movq        mm3, MMWORD [esi+1*SIZEOF_MMWORD]  ; row 1, input quadword 1
    268 
    269     movq        mm4, mm0                ; copies used to extract the odd bytes
    270     movq        mm5, mm1
    271     pand        mm0, mm6                ; even-offset bytes of row 0 as words
    272     psrlw       mm4, BYTE_BIT           ; odd-offset bytes of row 0 as words
    273     pand        mm1, mm6                ; same split for row 1
    274     psrlw       mm5, BYTE_BIT
    275     paddw       mm0, mm4                ; row 0 horizontal pair sums (quadword 0)
    276     paddw       mm1, mm5                ; row 1 horizontal pair sums (quadword 0)
    277 
    278     movq        mm4, mm2                ; repeat the even/odd split for quadword 1
    279     movq        mm5, mm3
    280     pand        mm2, mm6
    281     psrlw       mm4, BYTE_BIT
    282     pand        mm3, mm6
    283     psrlw       mm5, BYTE_BIT
    284     paddw       mm2, mm4                ; row 0 horizontal pair sums (quadword 1)
    285     paddw       mm3, mm5                ; row 1 horizontal pair sums (quadword 1)
    286 
    287     paddw       mm0, mm1                ; add both rows: 4-sample sums for outputs 0..3
    288     paddw       mm2, mm3                ; 4-sample sums for outputs 4..7
    289     paddw       mm0, mm7                ; add the alternating 1/2 bias
    290     paddw       mm2, mm7
    291     psrlw       mm0, 2                  ; divide each sum by 4
    292     psrlw       mm2, 2
    293 
    294     packuswb    mm0, mm2                ; narrow the 8 words to 8 bytes
    295 
    296     movq        MMWORD [edi+0*SIZEOF_MMWORD], mm0  ; store one quadword of output samples
    297 
    298     add         edx, byte 2*SIZEOF_MMWORD  ; inptr0
    299     add         esi, byte 2*SIZEOF_MMWORD  ; inptr1
    300     add         edi, byte 1*SIZEOF_MMWORD  ; outptr
    301     sub         ecx, byte SIZEOF_MMWORD    ; outcol
    302     jnz         near .columnloop        ; loop until the column count reaches 0
    303 
    304     pop         esi                     ; back to the input_data position
    305     pop         edi                     ; back to the output_data position
    306     pop         ecx                     ; output_cols for the next row
    307 
    308     add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    309     add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    310     dec         eax                          ; rowctr
    311     jg          near .rowloop           ; repeat while rows remain
    312 
    313     emms                                ; empty MMX state
    314 
    315 .return:
    316     pop         edi                     ; restore in reverse order of the prologue pushes
    317     pop         esi
    318 ;   pop         edx                     ; need not be preserved
    319 ;   pop         ecx                     ; need not be preserved
    320 ;   pop         ebx                     ; unused
    321     pop         ebp
    322     ret
    323 
    324 ; For some reason, the OS X linker does not honor the request to align the
    325 ; segment unless we do this.
    326     align       32                      ; pad the end of the code to a 32-byte boundary
    327