Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsample.asm - upsampling (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on
      7 ; x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22         SECTION SEG_CONST
        ;
        ; Word constants for the two "fancy" (triangle filter) upsamplers in
        ; this file.  Every PW_* entry is one MMWORD: the same 16-bit value
        ; repeated four times, so it can be used directly as a pmullw/paddw
        ; memory operand.  alignz is a macro from jsimdext.inc (not shown here).
        ;
     23 
     24         alignz  16
     25         global  EXTN(jconst_fancy_upsample_mmx)
     26 
     27 EXTN(jconst_fancy_upsample_mmx):
     28 
     29 PW_ONE          times 4 dw  1   ; h2v1 fancy: rounding bias for even output samples
     30 PW_TWO          times 4 dw  2   ; h2v1 fancy: rounding bias for odd output samples
     31 PW_THREE        times 4 dw  3   ; weight of the centre (nearer) sample in both filters
     32 PW_SEVEN        times 4 dw  7   ; h2v2 fancy: rounding bias for odd output samples
     33 PW_EIGHT        times 4 dw  8   ; h2v2 fancy: rounding bias for even output samples
     34 
     35         alignz  16
     36 
     37 ; --------------------------------------------------------------------------
     38         SECTION SEG_TEXT
     39         BITS    32
     40 ;
     41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     42 ;
     43 ; The upsampling algorithm is linear interpolation between pixel centers,
     44 ; also known as a "triangle filter".  This is a good compromise between
     45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     46 ; of the way between input pixel centers.
     47 ;
     48 ; GLOBAL(void)
     49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
     50 ;                                JDIMENSION downsampled_width,
     51 ;                                JSAMPARRAY input_data,
     52 ;                                JSAMPARRAY * output_data_ptr);
     53 ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  max_v_samp_factor is the number of rows processed and
        ; downsampled_width the number of input samples per row;
        ; *output_data_ptr is dereferenced once to get the output row array.
        ;
        ; For every input sample s[i] two output samples are produced:
        ;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
        ;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
        ; where s[-1] is taken as s[0] and the sample after the last one is
        ; taken as the last sample.
        ;
        ; Rows are processed in whole MMWORD blocks, so input reads extend to
        ; the width rounded up to a multiple of SIZEOF_MMWORD and output
        ; writes to twice that many bytes.  When the width is not such a
        ; multiple, one byte is also WRITTEN into the input row just past its
        ; last sample (the "dummy sample" below).
        ;
        ; Register use inside the loops:
        ;   eax = colctr (samples left in the row)   ecx = rowctr
        ;   esi = inptr    edi = outptr    ebx = GOT address (for GOTOFF)
        ;   mm0 = zero     mm7 = sample preceding the current block (low byte)
        ;                  mm6 = sample following the current block (high byte)
        ; pushpic/poppic/get_GOT/GOTOFF/alignx are macros from jsimdext.inc
        ; (not shown here).
        ;
        ; Returns nothing.  eax, ecx, edx and the MMX registers are clobbered;
        ; emms is executed before returning, except on the early-return paths,
        ; which are taken before any MMX instruction has run.
        ;
     54 
     55 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
     56 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
     57 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
     58 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
     59 
     60         align   16
     61         global  EXTN(jsimd_h2v1_fancy_upsample_mmx)
     62 
     63 EXTN(jsimd_h2v1_fancy_upsample_mmx):
     64         push    ebp
     65         mov     ebp,esp
     66         pushpic ebx
     67 ;       push    ecx             ; need not be preserved
     68 ;       push    edx             ; need not be preserved
     69         push    esi
     70         push    edi
     71 
     72         get_GOT ebx             ; get GOT address
     73 
                ; Nothing to do when the row width or the row count is zero.
     74         mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     75         test    eax,eax
     76         jz      near .return
     77 
     78         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
     79         test    ecx,ecx
     80         jz      near .return
     81 
     82         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
     83         mov     edi, POINTER [output_data_ptr(ebp)]
     84         mov     edi, JSAMPARRAY [edi]                   ; output_data
     85         alignx  16,7
     86 .rowloop:
                ; Save the per-row state; it is restored after the column loop
                ; because eax/esi/edi are reused as colctr/inptr/outptr.
     87         push    eax                     ; colctr
     88         push    edi
     89         push    esi
     90 
     91         mov     esi, JSAMPROW [esi]     ; inptr
     92         mov     edi, JSAMPROW [edi]     ; outptr
     93 
                ; If the width is not a multiple of SIZEOF_MMWORD, replicate
                ; the last sample one position past the end of the input row
                ; so that the last real sample sees itself as its right-hand
                ; neighbour.
     94         test    eax, SIZEOF_MMWORD-1
     95         jz      short .skip
     96         mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
     97         mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
     98 .skip:
                ; mm7 = left neighbour of the first sample: all-ones shifted
                ; right leaves a low-byte mask, which selects sample 0 itself.
     99         pxor    mm0,mm0                 ; mm0=(all 0's)
    100         pcmpeqb mm7,mm7
    101         psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
    102         pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]
    103 
                ; Round colctr up to whole blocks; with a single block fall
                ; straight through to the last-block case.
    104         add     eax, byte SIZEOF_MMWORD-1
    105         and     eax, byte -SIZEOF_MMWORD
    106         cmp     eax, byte SIZEOF_MMWORD
    107         ja      short .columnloop
    108         alignx  16,7
    109 
    110 .columnloop_last:
                ; Last block: mm6 = this block's own last sample, kept in the
                ; high byte, used as the right neighbour of sample 7.
    111         pcmpeqb mm6,mm6
    112         psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
    113         pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
    114         jmp     short .upsample
    115         alignx  16,7
    116 
    117 .columnloop:
                ; Not the last block: mm6 = first sample of the next block,
                ; moved into the high byte.
    118         movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
    119         psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
    120 
    121 .upsample:
                ; Build the left-neighbour (mm2) and right-neighbour (mm3)
                ; vectors by shifting the block one byte each way and filling
                ; the vacated byte from mm7 / mm6.
    122         movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
    123         movq    mm2,mm1
    124         movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
    125         psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
    126         psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)
    127 
    128         por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
    129         por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)
    130 
                ; Remember this block's last sample as the left neighbour for
                ; the next block.
    131         movq    mm7,mm1
    132         psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
    133 
                ; Widen bytes to 16-bit words (interleave with zero in mm0).
    134         movq      mm4,mm1
    135         punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
    136         punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
    137         movq      mm5,mm2
    138         punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
    139         punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
    140         movq      mm6,mm3
    141         punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
    142         punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)
    143 
                ; centre*3, left+1 (even outputs), right+2 (odd outputs)
    144         pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
    145         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
    146         paddw   mm2,[GOTOFF(ebx,PW_ONE)]
    147         paddw   mm5,[GOTOFF(ebx,PW_ONE)]
    148         paddw   mm3,[GOTOFF(ebx,PW_TWO)]
    149         paddw   mm6,[GOTOFF(ebx,PW_TWO)]
    150 
    151         paddw   mm2,mm1
    152         paddw   mm5,mm4
    153         psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
    154         psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
    155         paddw   mm3,mm1
    156         paddw   mm6,mm4
    157         psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
    158         psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)
    159 
                ; Interleave: odd results go to the high byte of each word,
                ; even results stay in the low byte.
    160         psllw   mm3,BYTE_BIT
    161         psllw   mm6,BYTE_BIT
    162         por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
    163         por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)
    164 
    165         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
    166         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5
    167 
                ; One input block consumed, two output blocks produced.
                ; More than one block left -> .columnloop, exactly one ->
                ; .columnloop_last, none -> row finished.
    168         sub     eax, byte SIZEOF_MMWORD
    169         add     esi, byte 1*SIZEOF_MMWORD       ; inptr
    170         add     edi, byte 2*SIZEOF_MMWORD       ; outptr
    171         cmp     eax, byte SIZEOF_MMWORD
    172         ja      near .columnloop
    173         test    eax,eax
    174         jnz     near .columnloop_last
    175 
    176         pop     esi
    177         pop     edi
    178         pop     eax
    179 
                ; Advance to the next input/output row pointer.
    180         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    181         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    182         dec     ecx                             ; rowctr
    183         jg      near .rowloop
    184 
    185         emms            ; empty MMX state
    186 
    187 .return:
    188         pop     edi
    189         pop     esi
    190 ;       pop     edx             ; need not be preserved
    191 ;       pop     ecx             ; need not be preserved
    192         poppic  ebx
    193         pop     ebp
    194         ret
    195 
    196 ; --------------------------------------------------------------------------
    197 ;
    198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    199 ; Again a triangle filter; see comments for h2v1 case, above.
    200 ;
    201 ; GLOBAL(void)
    202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
    203 ;                                JDIMENSION downsampled_width,
    204 ;                                JSAMPARRAY input_data,
    205 ;                                JSAMPARRAY * output_data_ptr);
    206 ;
        ; Arguments are read from the stack through the pre-alignment stack
        ; pointer kept in edx during setup (see the %defines below).
        ; The row counter is decremented by 2 per pass of .rowloop: each pass
        ; consumes one input row pointer and writes two output rows.  Each
        ; pass also reads the row pointers stored directly before and after
        ; the current one in input_data (index -1 on the first pass).
        ;
        ; Step 1 (vertical), per block of 8 samples, as 16-bit words:
        ;   Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
        ;   Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
        ; These intermediates are parked in the output rows themselves
        ; (8 samples -> 16 bytes, exactly the space of the final output).
        ; Step 2 (horizontal) replaces them in place with
        ;   out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
        ;   out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
        ; where Int[-1] is taken as Int[0] and the value after the last one is
        ; taken as the last one.
        ;
        ; As in the h2v1 case, rows are handled in whole MMWORD blocks, and
        ; when the width is not a multiple of SIZEOF_MMWORD one byte is WRITTEN
        ; past the last sample of each of the three input rows.
        ;
        ; Local frame (ebp is realigned to SIZEOF_MMWORD):
        ;   [original_ebp] = stack pointer value before realignment
        ;   wk(0), wk(1)   = Int0/Int1 word preceding the current block
        ;   wk(2), wk(3)   = Int0/Int1 word following the current block
        ;   gotptr         = saved GOT address
        ; Register use inside the column loop:
        ;   eax = colctr   ecx = inptr1(above)   ebx = inptr0
        ;   esi = inptr1(below)   edx = outptr0   edi = outptr1
        ; ebx doubles as inptr0, so it is saved with pushpic and reloaded from
        ; gotptr around every sequence that uses GOTOFF.  pushpic/poppic/
        ; movpic/get_GOT/GOTOFF/alignx are macros from jsimdext.inc (not shown).
        ;
        ; Returns nothing.  eax, ecx, edx and the MMX registers are clobbered;
        ; emms is executed except on the early-return paths, which are taken
        ; before any MMX instruction has run.
        ;
    207 
    208 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
    209 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
    210 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
    211 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
    212 
        ; Note: wk(i) refers to WK_NUM before WK_NUM is defined; that is fine
        ; because a %define body is expanded where the macro is used.
    213 %define original_ebp    ebp+0
    214 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
    215 %define WK_NUM          4
    216 %define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
    217 
    218         align   16
    219         global  EXTN(jsimd_h2v2_fancy_upsample_mmx)
    220 
    221 EXTN(jsimd_h2v2_fancy_upsample_mmx):
                ; Build an MMWORD-aligned frame: remember the incoming stack
                ; pointer in eax, store it at [ebp+0] of the aligned frame and
                ; reserve the wk[] area below it.
    222         push    ebp
    223         mov     eax,esp                         ; eax = original ebp
    224         sub     esp, byte 4
    225         and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
    226         mov     [esp],eax
    227         mov     ebp,esp                         ; ebp = aligned ebp
    228         lea     esp, [wk(0)]
    229         pushpic eax             ; make a room for GOT address
    230         push    ebx
    231 ;       push    ecx             ; need not be preserved
    232 ;       push    edx             ; need not be preserved
    233         push    esi
    234         push    edi
    235 
    236         get_GOT ebx                     ; get GOT address
    237         movpic  POINTER [gotptr], ebx   ; save GOT address
    238 
                ; Nothing to do when the row width or the row count is zero.
    239         mov     edx,eax                         ; edx = original ebp
    240         mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    241         test    eax,eax
    242         jz      near .return
    243 
    244         mov     ecx, INT [max_v_samp(edx)]      ; rowctr
    245         test    ecx,ecx
    246         jz      near .return
    247 
    248         mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
    249         mov     edi, POINTER [output_data_ptr(edx)]
    250         mov     edi, JSAMPARRAY [edi]                   ; output_data
    251         alignx  16,7
    252 .rowloop:
                ; Save the per-row state; all four registers are reused as
                ; row pointers / column counter below.
    253         push    eax                                     ; colctr
    254         push    ecx
    255         push    edi
    256         push    esi
    257 
    258         mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
    259         mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    260         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
    261         mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
    262         mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
    263 
                ; If the width is not a multiple of SIZEOF_MMWORD, replicate
                ; the last sample of all three input rows one position past
                ; the end.  edx is saved because dl is the scratch byte.
    264         test    eax, SIZEOF_MMWORD-1
    265         jz      short .skip
    266         push    edx
    267         mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    268         mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
    269         mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    270         mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
    271         mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    272         mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    273         pop     edx
    274 .skip:
    275         ; -- process the first column block
    276 
    277         movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
    278         movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
    279         movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]
    280 
    281         pushpic ebx
    282         movpic  ebx, POINTER [gotptr]   ; load GOT address
    283 
                ; Widen all three rows to 16-bit words.
    284         pxor      mm3,mm3               ; mm3=(all 0's)
    285         movq      mm4,mm0
    286         punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
    287         punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
    288         movq      mm5,mm1
    289         punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
    290         punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
    291         movq      mm6,mm2
    292         punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
    293         punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)
    294 
    295         pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
    296         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
    297 
                ; mm7 = mask selecting the lowest 16-bit word.
    298         pcmpeqb mm7,mm7
    299         psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
    300 
    301         paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
    302         paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
    303         paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
    304         paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
    305 
    306         movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
    307         movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
    308         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
    309         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6
    310 
                ; Left neighbour of the very first value is that value itself.
    311         pand    mm1,mm7                 ; mm1=( 0 - - -)
    312         pand    mm2,mm7                 ; mm2=( 0 - - -)
    313 
    314         movq    MMWORD [wk(0)], mm1
    315         movq    MMWORD [wk(1)], mm2
    316 
    317         poppic  ebx
    318 
                ; Round colctr up to whole blocks; with a single block fall
                ; straight through to the last-block case.
    319         add     eax, byte SIZEOF_MMWORD-1
    320         and     eax, byte -SIZEOF_MMWORD
    321         cmp     eax, byte SIZEOF_MMWORD
    322         ja      short .columnloop
    323         alignx  16,7
    324 
    325 .columnloop_last:
    326         ; -- process the last column block
    327 
    328         pushpic ebx
    329         movpic  ebx, POINTER [gotptr]   ; load GOT address
    330 
                ; Right neighbour of the last value is that value itself:
                ; keep only the highest word of the parked Int0H / Int1H.
    331         pcmpeqb mm1,mm1
    332         psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
    333         movq    mm2,mm1
    334 
    335         pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
    336         pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)
    337 
    338         movq    MMWORD [wk(2)], mm1
    339         movq    MMWORD [wk(3)], mm2
    340 
    341         jmp     short .upsample
    342         alignx  16,7
    343 
    344 .columnloop:
    345         ; -- process the next column block
    346 
                ; Compute the vertical intermediates of the NEXT block and park
                ; them in the next output block, so its first value is
                ; available as the right neighbour of the current block.
    347         movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
    348         movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
    349         movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]
    350 
    351         pushpic ebx
    352         movpic  ebx, POINTER [gotptr]   ; load GOT address
    353 
    354         pxor      mm3,mm3               ; mm3=(all 0's)
    355         movq      mm4,mm0
    356         punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
    357         punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
    358         movq      mm5,mm1
    359         punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
    360         punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
    361         movq      mm6,mm2
    362         punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
    363         punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)
    364 
    365         pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
    366         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
    367 
    368         paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
    369         paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
    370         paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
    371         paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
    372 
    373         movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
    374         movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
    375         movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
    376         movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6
    377 
    378         psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
    379         psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
    380 
    381         movq    MMWORD [wk(2)], mm1
    382         movq    MMWORD [wk(3)], mm2
    383 
    384 .upsample:
    385         ; -- process the upper row
    386 
                ; Horizontal pass over the parked Int0 values of the current
                ; block; ebx still holds the GOT address here.
    387         movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
    388         movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)
    389 
                ; Right-neighbour vector of the low half (mm0) and
                ; left-neighbour vector of the high half (mm5).
    390         movq    mm0,mm7
    391         movq    mm4,mm3
    392         psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
    393         psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
    394         movq    mm5,mm7
    395         movq    mm6,mm3
    396         psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
    397         psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)
    398 
    399         por     mm0,mm4                         ; mm0=( 1 2 3 4)
    400         por     mm5,mm6                         ; mm5=( 3 4 5 6)
    401 
                ; Left-neighbour vector of the low half (mm1) and
                ; right-neighbour vector of the high half (mm2); the missing
                ; words come from the carry slots wk(0) and wk(2).
    402         movq    mm1,mm7
    403         movq    mm2,mm3
    404         psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
    405         psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
    406         movq    mm4,mm3
    407         psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
    408 
    409         por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
    410         por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)
    411 
                ; This block's last value becomes the next block's left carry.
    412         movq    MMWORD [wk(0)], mm4
    413 
                ; centre*3, left+8 (even outputs), right+7 (odd outputs)
    414         pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
    415         pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
    416         paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
    417         paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
    418         paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
    419         paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]
    420 
    421         paddw   mm1,mm7
    422         paddw   mm5,mm3
    423         psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
    424         psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
    425         paddw   mm0,mm7
    426         paddw   mm2,mm3
    427         psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
    428         psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)
    429 
                ; Interleave even (low byte) and odd (high byte) results.
    430         psllw   mm0,BYTE_BIT
    431         psllw   mm2,BYTE_BIT
    432         por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
    433         por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
    434 
                ; Overwrite the parked intermediates with the final samples.
    435         movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
    436         movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5
    437 
    438         ; -- process the lower row
    439 
                ; Same horizontal pass for Int1, using carry slots wk(1)/wk(3).
    440         movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
    441         movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)
    442 
    443         movq    mm7,mm6
    444         movq    mm3,mm4
    445         psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
    446         psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
    447         movq    mm0,mm6
    448         movq    mm2,mm4
    449         psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
    450         psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)
    451 
    452         por     mm7,mm3                         ; mm7=( 1 2 3 4)
    453         por     mm0,mm2                         ; mm0=( 3 4 5 6)
    454 
    455         movq    mm1,mm6
    456         movq    mm5,mm4
    457         psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
    458         psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
    459         movq    mm3,mm4
    460         psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
    461 
    462         por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
    463         por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)
    464 
    465         movq    MMWORD [wk(1)], mm3
    466 
    467         pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
    468         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
    469         paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
    470         paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
    471         paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
    472         paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]
    473 
    474         paddw   mm1,mm6
    475         paddw   mm0,mm4
    476         psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
    477         psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
    478         paddw   mm7,mm6
    479         paddw   mm5,mm4
    480         psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
    481         psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)
    482 
    483         psllw   mm7,BYTE_BIT
    484         psllw   mm5,BYTE_BIT
    485         por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
    486         por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
    487 
    488         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
    489         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0
    490 
                ; Restore ebx = inptr0 (pairs with the pushpic on whichever
                ; path led to .upsample).
    491         poppic  ebx
    492 
                ; One input block consumed per row, two output blocks produced.
                ; More than one block left -> .columnloop, exactly one ->
                ; .columnloop_last, none -> row pair finished.
    493         sub     eax, byte SIZEOF_MMWORD
    494         add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
    495         add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
    496         add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
    497         add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
    498         add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
    499         cmp     eax, byte SIZEOF_MMWORD
    500         ja      near .columnloop
    501         test    eax,eax
    502         jnz     near .columnloop_last
    503 
    504         pop     esi
    505         pop     edi
    506         pop     ecx
    507         pop     eax
    508 
                ; One input row in, two output rows out.
    509         add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
    510         add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
    511         sub     ecx, byte 2                     ; rowctr
    512         jg      near .rowloop
    513 
    514         emms            ; empty MMX state
    515 
    516 .return:
    517         pop     edi
    518         pop     esi
    519 ;       pop     edx             ; need not be preserved
    520 ;       pop     ecx             ; need not be preserved
    521         pop     ebx
                ; Discard wk[]/gotptr, then reload the pre-alignment stack
                ; pointer that the prologue stored at [ebp+0].
    522         mov     esp,ebp         ; esp <- aligned ebp
    523         pop     esp             ; esp <- original ebp
    524         pop     ebp
    525         ret
    526 
    527 ; --------------------------------------------------------------------------
    528 ;
    529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    530 ; It's still a box filter.
    531 ;
    532 ; GLOBAL(void)
    533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
    534 ;                          JDIMENSION output_width,
    535 ;                          JSAMPARRAY input_data,
    536 ;                          JSAMPARRAY * output_data_ptr);
    537 ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  Every input sample is written twice in succession to the
        ; output row (each MMWORD is byte-interleaved with itself).
        ;
        ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD; that
        ; many bytes are written to every output row and half as many are
        ; read from every input row, so the row buffers must be accessible
        ; that far.  max_v_samp_factor is the number of rows processed.
        ;
        ; Register use: edx = rounded output width, eax = colctr (output bytes
        ; left in the row), ecx = rowctr, esi = inptr, edi = outptr.
        ; The column loop is unrolled twice (two input MMWORDs per pass).
        ;
        ; Returns nothing.  eax, ecx, edx and mm0-mm3 are clobbered; emms is
        ; executed except on the early-return paths, which are taken before
        ; any MMX instruction has run.
        ;
    538 
    539 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
    540 %define output_width(b)         (b)+12          ; JDIMENSION output_width
    541 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
    542 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
    543 
    544         align   16
    545         global  EXTN(jsimd_h2v1_upsample_mmx)
    546 
    547 EXTN(jsimd_h2v1_upsample_mmx):
    548         push    ebp
    549         mov     ebp,esp
    550 ;       push    ebx             ; unused
    551 ;       push    ecx             ; need not be preserved
    552 ;       push    edx             ; need not be preserved
    553         push    esi
    554         push    edi
    555 
                ; edx = output_width rounded up to a multiple of
                ; 2*SIZEOF_MMWORD; the jz uses the flags set by the and.
    556         mov     edx, JDIMENSION [output_width(ebp)]
    557         add     edx, byte (2*SIZEOF_MMWORD)-1
    558         and     edx, byte -(2*SIZEOF_MMWORD)
    559         jz      short .return
    560 
    561         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
    562         test    ecx,ecx
    563         jz      short .return
    564 
    565         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    566         mov     edi, POINTER [output_data_ptr(ebp)]
    567         mov     edi, JSAMPARRAY [edi]                   ; output_data
    568         alignx  16,7
    569 .rowloop:
                ; Save the row-array cursors; esi/edi become sample pointers.
    570         push    edi
    571         push    esi
    572 
    573         mov     esi, JSAMPROW [esi]             ; inptr
    574         mov     edi, JSAMPROW [edi]             ; outptr
    575         mov     eax,edx                         ; colctr
    576         alignx  16,7
    577 .columnloop:
    578 
                ; First half of the unrolled pass: 8 input samples -> 16
                ; output samples.
    579         movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    580 
    581         movq      mm1,mm0
    582         punpcklbw mm0,mm0
    583         punpckhbw mm1,mm1
    584 
    585         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
    586         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
    587 
    588         sub     eax, byte 2*SIZEOF_MMWORD
    589         jz      short .nextrow
    590 
                ; Second half: the following 8 input samples.
    591         movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    592 
    593         movq      mm3,mm2
    594         punpcklbw mm2,mm2
    595         punpckhbw mm3,mm3
    596 
    597         movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
    598         movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
    599 
    600         sub     eax, byte 2*SIZEOF_MMWORD
    601         jz      short .nextrow
    602 
    603         add     esi, byte 2*SIZEOF_MMWORD       ; inptr
    604         add     edi, byte 4*SIZEOF_MMWORD       ; outptr
    605         jmp     short .columnloop
    606         alignx  16,7
    607 
    608 .nextrow:
    609         pop     esi
    610         pop     edi
    611 
                ; Advance to the next input/output row pointer.
    612         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    613         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    614         dec     ecx                             ; rowctr
    615         jg      short .rowloop
    616 
    617         emms            ; empty MMX state
    618 
    619 .return:
    620         pop     edi
    621         pop     esi
    622 ;       pop     edx             ; need not be preserved
    623 ;       pop     ecx             ; need not be preserved
    624 ;       pop     ebx             ; unused
    625         pop     ebp
    626         ret
    627 
    628 ; --------------------------------------------------------------------------
    629 ;
    630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    631 ; It's still a box filter.
    632 ;
    633 ; GLOBAL(void)
    634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
    635 ;                          JDIMENSION output_width,
    636 ;                          JSAMPARRAY input_data,
    637 ;                          JSAMPARRAY * output_data_ptr);
    638 ;
        ; Arguments are read from the stack relative to ebp (see the %defines
        ; below).  Every input sample is written twice in succession, and the
        ; same doubled data is stored to two consecutive output rows.
        ;
        ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD; that
        ; many bytes are written to both output rows and half as many are
        ; read from the input row, so the row buffers must be accessible that
        ; far.  The row counter is decremented by 2 per pass of .rowloop: each
        ; pass consumes one input row and fills two output rows.
        ;
        ; Register use: edx = rounded output width, eax = colctr (output bytes
        ; left in the row), ecx = rowctr, esi = inptr, ebx = outptr0,
        ; edi = outptr1.  The column loop is unrolled twice.
        ;
        ; Returns nothing.  eax, ecx, edx and mm0-mm3 are clobbered; emms is
        ; executed except on the early-return paths, which are taken before
        ; any MMX instruction has run.
        ;
    639 
    640 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
    641 %define output_width(b)         (b)+12          ; JDIMENSION output_width
    642 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
    643 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
    644 
    645         align   16
    646         global  EXTN(jsimd_h2v2_upsample_mmx)
    647 
    648 EXTN(jsimd_h2v2_upsample_mmx):
    649         push    ebp
    650         mov     ebp,esp
    651         push    ebx
    652 ;       push    ecx             ; need not be preserved
    653 ;       push    edx             ; need not be preserved
    654         push    esi
    655         push    edi
    656 
                ; edx = output_width rounded up to a multiple of
                ; 2*SIZEOF_MMWORD; the jz uses the flags set by the and.
    657         mov     edx, JDIMENSION [output_width(ebp)]
    658         add     edx, byte (2*SIZEOF_MMWORD)-1
    659         and     edx, byte -(2*SIZEOF_MMWORD)
    660         jz      near .return
    661 
    662         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
    663         test    ecx,ecx
    664         jz      short .return
    665 
    666         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    667         mov     edi, POINTER [output_data_ptr(ebp)]
    668         mov     edi, JSAMPARRAY [edi]                   ; output_data
    669         alignx  16,7
    670 .rowloop:
                ; Save the row-array cursors; esi/edi become sample pointers.
    671         push    edi
    672         push    esi
    673 
    674         mov     esi, JSAMPROW [esi]                     ; inptr
    675         mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
    676         mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
    677         mov     eax,edx                                 ; colctr
    678         alignx  16,7
    679 .columnloop:
    680 
                ; First half of the unrolled pass: 8 input samples -> 16
                ; output samples, stored to both output rows.
    681         movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
    682 
    683         movq      mm1,mm0
    684         punpcklbw mm0,mm0
    685         punpckhbw mm1,mm1
    686 
    687         movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
    688         movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
    689         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
    690         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
    691 
    692         sub     eax, byte 2*SIZEOF_MMWORD
    693         jz      short .nextrow
    694 
                ; Second half: the following 8 input samples.
    695         movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
    696 
    697         movq      mm3,mm2
    698         punpcklbw mm2,mm2
    699         punpckhbw mm3,mm3
    700 
    701         movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
    702         movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
    703         movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
    704         movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
    705 
    706         sub     eax, byte 2*SIZEOF_MMWORD
    707         jz      short .nextrow
    708 
    709         add     esi, byte 2*SIZEOF_MMWORD       ; inptr
    710         add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
    711         add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
    712         jmp     short .columnloop
    713         alignx  16,7
    714 
    715 .nextrow:
    716         pop     esi
    717         pop     edi
    718 
                ; One input row in, two output rows out.
    719         add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
    720         add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
    721         sub     ecx, byte 2                     ; rowctr
    722         jg      short .rowloop
    723 
    724         emms            ; empty MMX state
    725 
    726 .return:
    727         pop     edi
    728         pop     esi
    729 ;       pop     edx             ; need not be preserved
    730 ;       pop     ecx             ; need not be preserved
    731         pop     ebx
    732         pop     ebp
    733         ret
    734 
    735 ; For some reason, the OS X linker does not honor the request to align the
    736 ; segment unless we do this.
    737         align   16
    738