Home | History | Annotate | Download | only in i386
      1 ;
      2 ; jdmrgext.asm - merged upsampling/color conversion (MMX)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2016, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jcolsamp.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 ;
     23 ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
     24 ;
     25 ; GLOBAL(void)
     26 ; jsimd_h2v1_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
     27 ;                                JDIMENSION in_row_group_ctr,
     28 ;                                JSAMPARRAY output_buf);
     29 ;
        ; Reads row in_row_group_ctr of input_buf[0], input_buf[1] and input_buf[2]
        ; (Y, Cb and Cr according to the register comments below) and writes
        ; RGB_PIXELSIZE bytes per pixel to the row output_buf[0].  Each Cb/Cr
        ; sample is combined with two horizontally adjacent Y samples.
        ;
        ; The four arguments are read from the stack at the offsets defined below,
        ; relative to the stack pointer captured right after "push ebp".
        ; The function returns immediately when output_width is 0.
        ;
        ; Register roles inside the loops:
        ;   esi = inptr0 (Y)    ebx = inptr1 (Cb)    edx = inptr2 (Cr)
        ;   edi = outptr        ecx = col (pixels still to be written)
        ;   eax = GOT address at the top of .columnloop, afterwards al = Yctr
        ; ebx, esi, edi and ebp are saved and restored; eax, ecx and edx are not.
        ;
        ; NOTE(review): the loads always fetch whole MMWORDs from the three input
        ; rows, even when fewer pixels remain -- presumably the caller guarantees
        ; the rows are readable up to the next MMWORD multiple; confirm.
        ;
     30 
     31 %define output_width(b)      (b) + 8    ; JDIMENSION output_width
     32 %define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
     33 %define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
     34 %define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
     35 
        ; Local frame, addressed from the aligned ebp set up in the prologue:
        ;   [ebp]    holds the original frame base (see "pop esp" in the epilogue)
        ;   wk(0..2) are WK_NUM mmwords directly below ebp
        ;   gotptr   is the pointer-sized slot directly below wk(0)
     36 %define original_ebp  ebp + 0
     37 %define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
     38 %define WK_NUM        3
     39 %define gotptr        wk(0) - SIZEOF_POINTER  ; void * gotptr
     40 
     41     align       32
     42     GLOBAL_FUNCTION(jsimd_h2v1_merged_upsample_mmx)
     43 
     44 EXTN(jsimd_h2v1_merged_upsample_mmx):
     45     push        ebp
     46     mov         eax, esp                    ; eax = original ebp
     47     sub         esp, byte 4                 ; room for the value stored at [esp] below
     48     and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
     49     mov         [esp], eax                  ; remember the unaligned frame base
     50     mov         ebp, esp                    ; ebp = aligned ebp
     51     lea         esp, [wk(0)]                ; reserve wk[0..WK_NUM-1]
     52     pushpic     eax                     ; make a room for GOT address
     53     push        ebx
     54 ;   push        ecx                     ; need not be preserved
     55 ;   push        edx                     ; need not be preserved
     56     push        esi
     57     push        edi
     58 
     59     get_GOT     ebx                     ; get GOT address
     60     movpic      POINTER [gotptr], ebx   ; save GOT address
     61 
        ; eax still holds the unaligned frame base, so the arguments are
        ; addressed through eax rather than through the realigned ebp.
     62     mov         ecx, JDIMENSION [output_width(eax)]  ; col
     63     test        ecx, ecx
     64     jz          near .return            ; nothing to do for a zero-width row
     65 
     66     push        ecx                     ; park col while ecx is used as the row index
     67 
     68     mov         edi, JSAMPIMAGE [input_buf(eax)]
     69     mov         ecx, JDIMENSION [in_row_group_ctr(eax)]
     70     mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
     71     mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
     72     mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
     73     mov         edi, JSAMPARRAY [output_buf(eax)]
     74     mov         esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]  ; inptr0
     75     mov         ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]  ; inptr1
     76     mov         edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]  ; inptr2
     77     mov         edi, JSAMPROW [edi]                      ; outptr
     78 
     79     pop         ecx                     ; col
     80 
     81     alignx      16, 7
        ; One pass of .columnloop consumes one MMWORD of Cb and of Cr and derives
        ; the (R-Y), (G-Y) and (B-Y) terms for the low (L) and high (H) halves.
        ; The L terms stay in mm0/mm2/mm4 for .Yloop_1st; the H terms are kept
        ; in wk(1)/wk(2)/wk(0) for .Yloop_2nd.
     82 .columnloop:
     83     movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
     84 
     85     movq        mm6, MMWORD [ebx]       ; mm6=Cb(01234567)
     86     movq        mm7, MMWORD [edx]       ; mm7=Cr(01234567)
     87 
     88     pxor        mm1, mm1                ; mm1=(all 0's)
     89     pcmpeqw     mm3, mm3                ; mm3=(all 1's)
     90     psllw       mm3, 7                  ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
     91 
     92     movq        mm4, mm6
     93     punpckhbw   mm6, mm1                ; mm6=Cb(4567)=CbH
     94     punpcklbw   mm4, mm1                ; mm4=Cb(0123)=CbL
     95     movq        mm0, mm7
     96     punpckhbw   mm7, mm1                ; mm7=Cr(4567)=CrH
     97     punpcklbw   mm0, mm1                ; mm0=Cr(0123)=CrL
     98 
     99     paddw       mm6, mm3                ; adding 0xFF80 subtracts 128 from each word
    100     paddw       mm4, mm3
    101     paddw       mm7, mm3
    102     paddw       mm0, mm3
    103 
    104     ; (Original)
    105     ; R = Y                + 1.40200 * Cr
    106     ; G = Y - 0.34414 * Cb - 0.71414 * Cr
    107     ; B = Y + 1.77200 * Cb
    108     ;
    109     ; (This implementation)
    110     ; R = Y                + 0.40200 * Cr + Cr
    111     ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    112     ; B = Y - 0.22800 * Cb + Cb + Cb
    113 
    114     movq        mm5, mm6                ; mm5=CbH
    115     movq        mm2, mm4                ; mm2=CbL
    116     paddw       mm6, mm6                ; mm6=2*CbH
    117     paddw       mm4, mm4                ; mm4=2*CbL
    118     movq        mm1, mm7                ; mm1=CrH
    119     movq        mm3, mm0                ; mm3=CrL
    120     paddw       mm7, mm7                ; mm7=2*CrH
    121     paddw       mm0, mm0                ; mm0=2*CrL
    122 
    123     pmulhw      mm6, [GOTOFF(eax,PW_MF0228)]  ; mm6=(2*CbH * -FIX(0.22800))
    124     pmulhw      mm4, [GOTOFF(eax,PW_MF0228)]  ; mm4=(2*CbL * -FIX(0.22800))
    125     pmulhw      mm7, [GOTOFF(eax,PW_F0402)]   ; mm7=(2*CrH * FIX(0.40200))
    126     pmulhw      mm0, [GOTOFF(eax,PW_F0402)]   ; mm0=(2*CrL * FIX(0.40200))
    127 
        ; Add PW_ONE and shift right by one: halves the doubled products above
        ; (PW_ONE is defined outside this file; presumably words of 1, which
        ; would make this a rounding step -- confirm).
    128     paddw       mm6, [GOTOFF(eax,PW_ONE)]
    129     paddw       mm4, [GOTOFF(eax,PW_ONE)]
    130     psraw       mm6, 1                  ; mm6=(CbH * -FIX(0.22800))
    131     psraw       mm4, 1                  ; mm4=(CbL * -FIX(0.22800))
    132     paddw       mm7, [GOTOFF(eax,PW_ONE)]
    133     paddw       mm0, [GOTOFF(eax,PW_ONE)]
    134     psraw       mm7, 1                  ; mm7=(CrH * FIX(0.40200))
    135     psraw       mm0, 1                  ; mm0=(CrL * FIX(0.40200))
    136 
    137     paddw       mm6, mm5                ; + CbH (first of the two "+ Cb" terms)
    138     paddw       mm4, mm2                ; + CbL (first of the two "+ Cb" terms)
    139     paddw       mm6, mm5                ; mm6=(CbH * FIX(1.77200))=(B-Y)H
    140     paddw       mm4, mm2                ; mm4=(CbL * FIX(1.77200))=(B-Y)L
    141     paddw       mm7, mm1                ; mm7=(CrH * FIX(1.40200))=(R-Y)H
    142     paddw       mm0, mm3                ; mm0=(CrL * FIX(1.40200))=(R-Y)L
    143 
    144     movq        MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
    145     movq        MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
    146 
        ; G term: interleave Cb and Cr words so that each pmaddwd computes
        ; Cb * (first constant) + Cr * (second constant) as one dword.
    147     movq        mm6, mm5                ; mm6=CbH
    148     movq        mm7, mm2                ; mm7=CbL
    149     punpcklwd   mm5, mm1                ; mm5=CbH/CrH word pairs (low half)
    150     punpckhwd   mm6, mm1                ; mm6=CbH/CrH word pairs (high half)
    151     pmaddwd     mm5, [GOTOFF(eax,PW_MF0344_F0285)]
    152     pmaddwd     mm6, [GOTOFF(eax,PW_MF0344_F0285)]
    153     punpcklwd   mm2, mm3                ; mm2=CbL/CrL word pairs (low half)
    154     punpckhwd   mm7, mm3                ; mm7=CbL/CrL word pairs (high half)
    155     pmaddwd     mm2, [GOTOFF(eax,PW_MF0344_F0285)]
    156     pmaddwd     mm7, [GOTOFF(eax,PW_MF0344_F0285)]
    157 
    158     paddd       mm5, [GOTOFF(eax,PD_ONEHALF)]
    159     paddd       mm6, [GOTOFF(eax,PD_ONEHALF)]
    160     psrad       mm5, SCALEBITS
    161     psrad       mm6, SCALEBITS
    162     paddd       mm2, [GOTOFF(eax,PD_ONEHALF)]
    163     paddd       mm7, [GOTOFF(eax,PD_ONEHALF)]
    164     psrad       mm2, SCALEBITS
    165     psrad       mm7, SCALEBITS
    166 
    167     packssdw    mm5, mm6                ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
    168     packssdw    mm2, mm7                ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
    169     psubw       mm5, mm1                ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
    170     psubw       mm2, mm3                ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
    171 
    172     movq        MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
    173 
        ; From here on the GOT address in eax is no longer needed in this pass;
        ; al counts the two Y MMWORDs that share this chroma MMWORD.
    174     mov         al, 2                   ; Yctr
    175     jmp         short .Yloop_1st        ; L terms are already in mm0/mm2/mm4
    176     alignx      16, 7
    177 
        ; Second Y MMWORD: reload the H terms saved in wk[] above.
    178 .Yloop_2nd:
    179     movq        mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
    180     movq        mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
    181     movq        mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
    182     alignx      16, 7
    183 
        ; Split the Y samples into even and odd columns so that chroma word k
        ; is added to both Y(2k) and Y(2k+1).
    184 .Yloop_1st:
    185     movq        mm7, MMWORD [esi]       ; mm7=Y(01234567)
    186 
    187     pcmpeqw     mm6, mm6
    188     psrlw       mm6, BYTE_BIT           ; mm6={0xFF 0x00 0xFF 0x00 ..}
    189     pand        mm6, mm7                ; mm6=Y(0246)=YE
    190     psrlw       mm7, BYTE_BIT           ; mm7=Y(1357)=YO
    191 
    192     movq        mm1, mm0                ; mm1=mm0=(R-Y)(L/H)
    193     movq        mm3, mm2                ; mm3=mm2=(G-Y)(L/H)
    194     movq        mm5, mm4                ; mm5=mm4=(B-Y)(L/H)
    195 
    196     paddw       mm0, mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
    197     paddw       mm1, mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
    198     packuswb    mm0, mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
    199     packuswb    mm1, mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)
    200 
    201     paddw       mm2, mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
    202     paddw       mm3, mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
    203     packuswb    mm2, mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
    204     packuswb    mm3, mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)
    205 
    206     paddw       mm4, mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
    207     paddw       mm5, mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
    208     packuswb    mm4, mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
    209     packuswb    mm5, mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)
    210 
        ; mmA..mmH below are register aliases that are not defined in this file
        ; (presumably they come from jcolsamp.inc and map onto mm0..mm7 according
        ; to the output channel order -- confirm).  In the layout comments the
        ; first digit is the byte position within a pixel and the second digit is
        ; the pixel index.
    211 %if RGB_PIXELSIZE == 3  ; ---------------
    212 
    213     ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    214     ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    215     ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    216     ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
    217 
    218     punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    219     punpcklbw   mmE, mmB                ; mmE=(20 01 22 03 24 05 26 07)
    220     punpcklbw   mmD, mmF                ; mmD=(11 21 13 23 15 25 17 27)
    221 
    222     movq        mmG, mmA
    223     movq        mmH, mmA
    224     punpcklwd   mmA, mmE                ; mmA=(00 10 20 01 02 12 22 03)
    225     punpckhwd   mmG, mmE                ; mmG=(04 14 24 05 06 16 26 07)
    226 
    227     psrlq       mmH, 2*BYTE_BIT         ; mmH=(02 12 04 14 06 16 -- --)
    228     psrlq       mmE, 2*BYTE_BIT         ; mmE=(22 03 24 05 26 07 -- --)
    229 
    230     movq        mmC, mmD
    231     movq        mmB, mmD
    232     punpcklwd   mmD, mmH                ; mmD=(11 21 02 12 13 23 04 14)
    233     punpckhwd   mmC, mmH                ; mmC=(15 25 06 16 17 27 -- --)
    234 
    235     psrlq       mmB, 2*BYTE_BIT         ; mmB=(13 23 15 25 17 27 -- --)
    236 
    237     movq        mmF, mmE
    238     punpcklwd   mmE, mmB                ; mmE=(22 03 13 23 24 05 15 25)
    239     punpckhwd   mmF, mmB                ; mmF=(26 07 17 27 -- -- -- --)
    240 
    241     punpckldq   mmA, mmD                ; mmA=(00 10 20 01 11 21 02 12)
    242     punpckldq   mmE, mmG                ; mmE=(22 03 13 23 04 14 24 05)
    243     punpckldq   mmC, mmF                ; mmC=(15 25 06 16 26 07 17 27)
    244 
    245     cmp         ecx, byte SIZEOF_MMWORD
    246     jb          short .column_st16      ; fewer than SIZEOF_MMWORD pixels left: partial store
    247 
    248     movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    249     movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    250     movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    251 
    252     sub         ecx, byte SIZEOF_MMWORD
    253     jz          near .endcolumn         ; row finished exactly on this group
    254 
    255     add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    256     add         esi, byte SIZEOF_MMWORD                ; inptr0
    257     dec         al                                     ; Yctr
    258     jnz         near .Yloop_2nd         ; second Y MMWORD reuses the saved H terms
    259 
    260     add         ebx, byte SIZEOF_MMWORD                ; inptr1
    261     add         edx, byte SIZEOF_MMWORD                ; inptr2
    262     jmp         near .columnloop
    263     alignx      16, 7
    264 
        ; Partial store of the last group.  ecx is converted from pixels to
        ; bytes, then the remainder is written in progressively smaller pieces
        ; (two MMWORDs, one MMWORD, dword, word, byte); mmA always holds the
        ; bytes that are written next.
    265 .column_st16:
    266     lea         ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
    267     cmp         ecx, byte 2*SIZEOF_MMWORD
    268     jb          short .column_st8
    269     movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    270     movq        MMWORD [edi+1*SIZEOF_MMWORD], mmE
    271     movq        mmA, mmC                ; third MMWORD becomes the pending data
    272     sub         ecx, byte 2*SIZEOF_MMWORD
    273     add         edi, byte 2*SIZEOF_MMWORD
    274     jmp         short .column_st4       ; less than one MMWORD can remain on this path
    275 .column_st8:
    276     cmp         ecx, byte SIZEOF_MMWORD
    277     jb          short .column_st4
    278     movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    279     movq        mmA, mmE                ; second MMWORD becomes the pending data
    280     sub         ecx, byte SIZEOF_MMWORD
    281     add         edi, byte SIZEOF_MMWORD
    282 .column_st4:
    283     movd        eax, mmA                ; eax=low dword of the pending data (Yctr no longer needed)
    284     cmp         ecx, byte SIZEOF_DWORD
    285     jb          short .column_st2
    286     mov         DWORD [edi+0*SIZEOF_DWORD], eax
    287     psrlq       mmA, DWORD_BIT          ; bring the high dword down
    288     movd        eax, mmA
    289     sub         ecx, byte SIZEOF_DWORD
    290     add         edi, byte SIZEOF_DWORD
    291 .column_st2:
    292     cmp         ecx, byte SIZEOF_WORD
    293     jb          short .column_st1
    294     mov         WORD [edi+0*SIZEOF_WORD], ax
    295     shr         eax, WORD_BIT           ; bring the next bytes down into ax
    296     sub         ecx, byte SIZEOF_WORD
    297     add         edi, byte SIZEOF_WORD
    298 .column_st1:
    299     cmp         ecx, byte SIZEOF_BYTE
    300     jb          short .endcolumn
    301     mov         BYTE [edi+0*SIZEOF_BYTE], al
    302 
    303 %else  ; RGB_PIXELSIZE == 4 ; -----------
    304 
        ; mm6/mm7 are free once the RGB bytes are packed, so they supply the
        ; fourth (filler) byte of every pixel: all ones or all zeros.
    305 %ifdef RGBX_FILLER_0XFF
    306     pcmpeqb     mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    307     pcmpeqb     mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
    308 %else
    309     pxor        mm6, mm6                ; mm6=(X0 X2 X4 X6 ** ** ** **)
    310     pxor        mm7, mm7                ; mm7=(X1 X3 X5 X7 ** ** ** **)
    311 %endif
    312     ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
    313     ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
    314     ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
    315     ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
    316 
    317     punpcklbw   mmA, mmC                ; mmA=(00 10 02 12 04 14 06 16)
    318     punpcklbw   mmE, mmG                ; mmE=(20 30 22 32 24 34 26 36)
    319     punpcklbw   mmB, mmD                ; mmB=(01 11 03 13 05 15 07 17)
    320     punpcklbw   mmF, mmH                ; mmF=(21 31 23 33 25 35 27 37)
    321 
    322     movq        mmC, mmA
    323     punpcklwd   mmA, mmE                ; mmA=(00 10 20 30 02 12 22 32)
    324     punpckhwd   mmC, mmE                ; mmC=(04 14 24 34 06 16 26 36)
    325     movq        mmG, mmB
    326     punpcklwd   mmB, mmF                ; mmB=(01 11 21 31 03 13 23 33)
    327     punpckhwd   mmG, mmF                ; mmG=(05 15 25 35 07 17 27 37)
    328 
    329     movq        mmD, mmA
    330     punpckldq   mmA, mmB                ; mmA=(00 10 20 30 01 11 21 31)
    331     punpckhdq   mmD, mmB                ; mmD=(02 12 22 32 03 13 23 33)
    332     movq        mmH, mmC
    333     punpckldq   mmC, mmG                ; mmC=(04 14 24 34 05 15 25 35)
    334     punpckhdq   mmH, mmG                ; mmH=(06 16 26 36 07 17 27 37)
    335 
    336     cmp         ecx, byte SIZEOF_MMWORD
    337     jb          short .column_st16      ; fewer than SIZEOF_MMWORD pixels left: partial store
    338 
    339     movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    340     movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    341     movq        MMWORD [edi+2*SIZEOF_MMWORD], mmC
    342     movq        MMWORD [edi+3*SIZEOF_MMWORD], mmH
    343 
    344     sub         ecx, byte SIZEOF_MMWORD
    345     jz          short .endcolumn        ; row finished exactly on this group
    346 
    347     add         edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; outptr
    348     add         esi, byte SIZEOF_MMWORD                ; inptr0
    349     dec         al                                     ; Yctr
    350     jnz         near .Yloop_2nd         ; second Y MMWORD reuses the saved H terms
    351 
    352     add         ebx, byte SIZEOF_MMWORD                ; inptr1
    353     add         edx, byte SIZEOF_MMWORD                ; inptr2
    354     jmp         near .columnloop
    355     alignx      16, 7
    356 
        ; Partial store of the last group.  Here ecx stays in pixels; every
        ; MMWORD written holds two pixels and the final dword holds one.
        ; mmA (then mmD) always holds the data that is written next.
    357 .column_st16:
    358     cmp         ecx, byte SIZEOF_MMWORD/2
    359     jb          short .column_st8
    360     movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    361     movq        MMWORD [edi+1*SIZEOF_MMWORD], mmD
    362     movq        mmA, mmC                ; upper half becomes the pending data
    363     movq        mmD, mmH
    364     sub         ecx, byte SIZEOF_MMWORD/2
    365     add         edi, byte 2*SIZEOF_MMWORD
    366 .column_st8:
    367     cmp         ecx, byte SIZEOF_MMWORD/4
    368     jb          short .column_st4
    369     movq        MMWORD [edi+0*SIZEOF_MMWORD], mmA
    370     movq        mmA, mmD                ; next MMWORD becomes the pending data
    371     sub         ecx, byte SIZEOF_MMWORD/4
    372     add         edi, byte 1*SIZEOF_MMWORD
    373 .column_st4:
    374     cmp         ecx, byte SIZEOF_MMWORD/8
    375     jb          short .endcolumn
    376     movd        DWORD [edi+0*SIZEOF_DWORD], mmA  ; last single pixel
    377 
    378 %endif  ; RGB_PIXELSIZE ; ---------------
    379 
    380 .endcolumn:
    381     emms                                ; empty MMX state
    382 
        ; The zero-width early exit lands here directly; no MMX instruction has
        ; been executed on that path, so skipping emms is harmless.
    383 .return:
    384     pop         edi
    385     pop         esi
    386 ;   pop         edx                     ; need not be preserved
    387 ;   pop         ecx                     ; need not be preserved
    388     pop         ebx
    389     mov         esp, ebp                ; esp <- aligned ebp
    390     pop         esp                     ; esp <- original ebp
    391     pop         ebp
    392     ret
    393 
    394 ; --------------------------------------------------------------------------
    395 ;
    396 ; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
    397 ;
    398 ; GLOBAL(void)
    399 ; jsimd_h2v2_merged_upsample_mmx(JDIMENSION output_width, JSAMPIMAGE input_buf,
    400 ;                                JDIMENSION in_row_group_ctr,
    401 ;                                JSAMPARRAY output_buf);
    402 ;
        ; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx, one per
        ; output row.  Both calls use the same rows of input_buf[1] and
        ; input_buf[2]; the row of input_buf[0] and the output row advance by one
        ; between the calls.
        ;
        ; A three-pointer array {inptr00, inptr1, inptr2} is built on the stack
        ; and passed as the callee's input_buf.  Its first entry is input_buf[0]
        ; already advanced by in_row_group_ctr rows; the callee indexes it by
        ; in_row_group_ctr again, so the rows of input_buf[0] actually read are
        ; 2*in_row_group_ctr and 2*in_row_group_ctr+1.
        ;
        ; ebx, esi, edi and ebp are saved and restored; eax, ecx and edx are not.
        ; The code relies on the callee preserving ebx, esi and edi and leaving
        ; its stack arguments unmodified.
        ;
    403 
    404 %define output_width(b)      (b) + 8    ; JDIMENSION output_width
    405 %define input_buf(b)         (b) + 12   ; JSAMPIMAGE input_buf
    406 %define in_row_group_ctr(b)  (b) + 16   ; JDIMENSION in_row_group_ctr
    407 %define output_buf(b)        (b) + 20   ; JSAMPARRAY output_buf
    408 
    409     align       32
    410     GLOBAL_FUNCTION(jsimd_h2v2_merged_upsample_mmx)
    411 
    412 EXTN(jsimd_h2v2_merged_upsample_mmx):
    413     push        ebp
    414     mov         ebp, esp
    415     push        ebx
    416 ;   push        ecx                     ; need not be preserved
    417 ;   push        edx                     ; need not be preserved
    418     push        esi
    419     push        edi
    420 
    421     mov         eax, JDIMENSION [output_width(ebp)]
    422 
    423     mov         edi, JSAMPIMAGE [input_buf(ebp)]
    424     mov         ecx, JDIMENSION [in_row_group_ctr(ebp)]
    425     mov         esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    426     mov         ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    427     mov         edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    428     mov         edi, JSAMPARRAY [output_buf(ebp)]
    429     lea         esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi=&input_buf[0][in_row_group_ctr]
    430 
        ; Build the temporary input_buf array; pushing in reverse order leaves
        ; inptr00 at the lowest address, i.e. as element 0.
    431     push        edx                     ; inptr2
    432     push        ebx                     ; inptr1
    433     push        esi                     ; inptr00
    434     mov         ebx, esp                ; ebx=address of the temporary array
    435 
        ; Arguments for the callee, pushed last-to-first.
    436     push        edi                     ; output_buf (outptr0)
    437     push        ecx                     ; in_row_group_ctr
    438     push        ebx                     ; input_buf
    439     push        eax                     ; output_width
    440 
    441     call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
    442 
        ; Second row: patch the two stack slots in place instead of re-pushing.
        ; [ebx+0] is element 0 of the temporary array and [ebx-1*SIZEOF_POINTER]
        ; is the output_buf argument slot pushed right after ebx was captured.
    443     add         esi, byte SIZEOF_JSAMPROW  ; inptr01
    444     add         edi, byte SIZEOF_JSAMPROW  ; outptr1
    445     mov         POINTER [ebx+0*SIZEOF_POINTER], esi
    446     mov         POINTER [ebx-1*SIZEOF_POINTER], edi
    447 
    448     call        near EXTN(jsimd_h2v1_merged_upsample_mmx)
    449 
    450     add         esp, byte 7*SIZEOF_DWORD  ; drop 4 arguments + 3 array entries
    451 
    452     pop         edi
    453     pop         esi
    454 ;   pop         edx                     ; need not be preserved
    455 ;   pop         ecx                     ; need not be preserved
    456     pop         ebx
    457     pop         ebp
    458     ret
    459 
    460 ; Pad the end of this file's code to a 32-byte boundary.  For some reason,
    461 ; the OS X linker does not honor the request to align the segment unless we
        ; do this.
    462     align       32
    463