Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsample.asm - upsampling (SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ;
      6 ; Based on the x86 SIMD extension for IJG JPEG library
      7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
      9 ;
     10 ; This file should be assembled with NASM (Netwide Assembler),
     11 ; can *not* be assembled with Microsoft's MASM or any compatible
     12 ; assembler (including Borland's Turbo Assembler).
     13 ; NASM is available from http://nasm.sourceforge.net/ or
     14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     15 ;
     16 ; [TAB8]
     17 
     18 %include "jsimdext.inc"
     19 
     20 ; --------------------------------------------------------------------------
     21         SECTION SEG_CONST
     22 ; Word constants used by the two "fancy" (triangle filter) upsamplers below.
     23         alignz  16
     24         global  EXTN(jconst_fancy_upsample_sse2)
     25 ; Each PW_* entry holds the same 16-bit value 8 times (one XMM word of data).
     26 EXTN(jconst_fancy_upsample_sse2):
     27 
     28 PW_ONE          times 8 dw  1   ; h2v1 fancy: bias for even outputs (before >>2)
     29 PW_TWO          times 8 dw  2   ; h2v1 fancy: bias for odd outputs  (before >>2)
     30 PW_THREE        times 8 dw  3   ; weight applied to the current sample / row[0]
     31 PW_SEVEN        times 8 dw  7   ; h2v2 fancy: bias for odd outputs  (before >>4)
     32 PW_EIGHT        times 8 dw  8   ; h2v2 fancy: bias for even outputs (before >>4)
     33 
     34         alignz  16
     35 
     36 ; --------------------------------------------------------------------------
     37         SECTION SEG_TEXT
     38         BITS    32
     39 ;
     40 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     41 ;
     42 ; The upsampling algorithm is linear interpolation between pixel centers,
     43 ; also known as a "triangle filter".  This is a good compromise between
     44 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     45 ; of the way between input pixel centers.
     46 ;
     47 ; GLOBAL(void)
     48 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
     49 ;                                 JDIMENSION downsampled_width,
     50 ;                                 JSAMPARRAY input_data,
     51 ;                                 JSAMPARRAY *output_data_ptr);
     52 ; As coded: out[2i]   = (3*in[i] + in[i-1] + 1) >> 2
     53 ;           out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
     54 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
     55 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
     56 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
     57 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
     58 ; Registers: eax=colctr  ecx=rowctr  esi=input ptr  edi=output ptr  ebx=GOT address
     59         align   16
     60         global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
     61 
     62 EXTN(jsimd_h2v1_fancy_upsample_sse2):
     63         push    ebp
     64         mov     ebp,esp                 ; arguments are addressed as (ebp)+8 ...
     65         pushpic ebx
     66 ;       push    ecx             ; need not be preserved
     67 ;       push    edx             ; need not be preserved
     68         push    esi
     69         push    edi
     70 
     71         get_GOT ebx             ; get GOT address
     72 
     73         mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
     74         test    eax,eax
     75         jz      near .return            ; zero width: nothing to do
     76 
     77         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
     78         test    ecx,ecx
     79         jz      near .return            ; zero rows: nothing to do
     80 
     81         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
     82         mov     edi, POINTER [output_data_ptr(ebp)]
     83         mov     edi, JSAMPARRAY [edi]                   ; output_data
     84         alignx  16,7
     85 .rowloop:
     86         push    eax                     ; colctr
     87         push    edi
     88         push    esi
     89 
     90         mov     esi, JSAMPROW [esi]     ; inptr
     91         mov     edi, JSAMPROW [edi]     ; outptr
     92 ; NOTE(review): the dummy store below writes input byte [width]; assumes padded rows - confirm
     93         test    eax, SIZEOF_XMMWORD-1
     94         jz      short .skip             ; width already fills whole XMM blocks
     95         mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]        ; dl = last sample of the row
     96         mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
     97 .skip:
     98         pxor    xmm0,xmm0               ; xmm0=(all 0's)
     99         pcmpeqb xmm7,xmm7               ; xmm7=(all 1's)
    100         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=mask for byte 0 only
    101         pand    xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- -- ... -- -- --)
    102 
    103         add     eax, byte SIZEOF_XMMWORD-1
    104         and     eax, byte -SIZEOF_XMMWORD       ; round colctr up to whole blocks
    105         cmp     eax, byte SIZEOF_XMMWORD
    106         ja      short .columnloop               ; more than one block to process
    107         alignx  16,7
    108 
    109 .columnloop_last:                       ; last block: sample 15 stands in for "16"
    110         pcmpeqb xmm6,xmm6               ; xmm6=(all 1's)
    111         pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=mask for byte 15 only
    112         pand    xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm6=(-- -- -- ... -- -- 15)
    113         jmp     short .upsample
    114         alignx  16,7
    115 
    116 .columnloop:
    117         movdqa  xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; peek at the next block
    118         pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=(-- -- -- ... -- -- 16)
    119 
    120 .upsample:
    121         movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    122         movdqa  xmm2,xmm1
    123         movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
    124         pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
    125         psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
    126 
    127         por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
    128         por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
    129 
    130         movdqa  xmm7,xmm1
    131         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
    132 
    133         movdqa    xmm4,xmm1
    134         punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
    135         punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
    136         movdqa    xmm5,xmm2
    137         punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
    138         punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
    139         movdqa    xmm6,xmm3
    140         punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
    141         punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
    142 
    143         pmullw  xmm1,[GOTOFF(ebx,PW_THREE)]     ; 3 * current sample
    144         pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
    145         paddw   xmm2,[GOTOFF(ebx,PW_ONE)]       ; left neighbour + 1
    146         paddw   xmm5,[GOTOFF(ebx,PW_ONE)]
    147         paddw   xmm3,[GOTOFF(ebx,PW_TWO)]       ; right neighbour + 2
    148         paddw   xmm6,[GOTOFF(ebx,PW_TWO)]
    149 
    150         paddw   xmm2,xmm1
    151         paddw   xmm5,xmm4
    152         psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    153         psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    154         paddw   xmm3,xmm1
    155         paddw   xmm6,xmm4
    156         psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    157         psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
    158 
    159         psllw   xmm3,BYTE_BIT           ; shift odd outputs up so that the
    160         psllw   xmm6,BYTE_BIT           ; por below interleaves even and odd
    161         por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    162         por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
    163 
    164         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    165         movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
    166 
    167         sub     eax, byte SIZEOF_XMMWORD
    168         add     esi, byte 1*SIZEOF_XMMWORD      ; inptr
    169         add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
    170         cmp     eax, byte SIZEOF_XMMWORD
    171         ja      near .columnloop                ; at least two blocks left
    172         test    eax,eax
    173         jnz     near .columnloop_last           ; exactly one block left
    174 
    175         pop     esi
    176         pop     edi
    177         pop     eax
    178 
    179         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    180         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    181         dec     ecx                             ; rowctr
    182         jg      near .rowloop
    183 
    184 .return:
    185         pop     edi
    186         pop     esi
    187 ;       pop     edx             ; need not be preserved
    188 ;       pop     ecx             ; need not be preserved
    189         poppic  ebx
    190         pop     ebp
    191         ret
    192 
    193 ; --------------------------------------------------------------------------
    194 ;
    195 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    196 ; Again a triangle filter; see comments for h2v1 case, above.
    197 ;
    198 ; GLOBAL(void)
    199 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
    200 ;                                 JDIMENSION downsampled_width,
    201 ;                                 JSAMPARRAY input_data,
    202 ;                                 JSAMPARRAY *output_data_ptr);
    203 ; Vertical:   Int0 = 3*row[0] + row[-1],  Int1 = 3*row[0] + row[+1]  (16-bit words)
    204 ; Horizontal: out[2i] = (3*Int[i]+Int[i-1]+8)>>4,  out[2i+1] = (3*Int[i]+Int[i+1]+7)>>4
    205 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
    206 %define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
    207 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
    208 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
    209 ; wk(0)/wk(1): left-neighbour word, wk(2)/wk(3): right-neighbour word (upper/lower row)
    210 %define original_ebp    ebp+0
    211 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
    212 %define WK_NUM          4
    213 %define gotptr          wk(0)-SIZEOF_POINTER    ; void *gotptr
    214 
    215         align   16
    216         global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
    217 
    218 EXTN(jsimd_h2v2_fancy_upsample_sse2):
    219         push    ebp
    220         mov     eax,esp                         ; eax = original ebp
    221         sub     esp, byte 4                     ; room for the saved pointer
    222         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    223         mov     [esp],eax                       ; saved for "pop esp" in the epilogue
    224         mov     ebp,esp                         ; ebp = aligned ebp
    225         lea     esp, [wk(0)]                    ; wk[] occupies the space below ebp
    226         pushpic eax             ; make a room for GOT address
    227         push    ebx
    228 ;       push    ecx             ; need not be preserved
    229 ;       push    edx             ; need not be preserved
    230         push    esi
    231         push    edi
    232 
    233         get_GOT ebx                     ; get GOT address
    234         movpic  POINTER [gotptr], ebx   ; save GOT address
    235 
    236         mov     edx,eax                         ; edx = original ebp
    237         mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
    238         test    eax,eax
    239         jz      near .return                    ; zero width: nothing to do
    240 
    241         mov     ecx, INT [max_v_samp(edx)]      ; rowctr
    242         test    ecx,ecx
    243         jz      near .return                    ; zero rows: nothing to do
    244 
    245         mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
    246         mov     edi, POINTER [output_data_ptr(edx)]
    247         mov     edi, JSAMPARRAY [edi]                   ; output_data
    248         alignx  16,7
    249 .rowloop:
    250         push    eax                                     ; colctr
    251         push    ecx                                     ; rowctr (ecx is reused below)
    252         push    edi
    253         push    esi
    254 
    255         mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
    256         mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
    257         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
    258         mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
    259         mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
    260 
    261         test    eax, SIZEOF_XMMWORD-1
    262         jz      short .skip                     ; width already fills whole XMM blocks
    263         push    edx                             ; dl is used as scratch below
    264         mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
    265         mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl    ; row above: repeat last sample
    266         mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
    267         mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl    ; current row: repeat last sample
    268         mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    269         mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    270         pop     edx
    271 .skip:
    272         ; -- process the first column block
    273 
    274         movdqa  xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
    275         movdqa  xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
    276         movdqa  xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
    277 
    278         pushpic ebx                     ; save inptr0 (ebx is borrowed for the GOT)
    279         movpic  ebx, POINTER [gotptr]   ; load GOT address
    280 
    281         pxor      xmm3,xmm3             ; xmm3=(all 0's)
    282         movdqa    xmm4,xmm0
    283         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    284         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    285         movdqa    xmm5,xmm1
    286         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    287         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    288         movdqa    xmm6,xmm2
    289         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    290         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    291 
    292         pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]     ; 3 * row[0]
    293         pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
    294 
    295         pcmpeqb xmm7,xmm7                       ; xmm7=(all 1's)
    296         psrldq  xmm7,(SIZEOF_XMMWORD-2)         ; xmm7=mask for word 0 only
    297 
    298         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    299         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    300         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    301         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    302 
    303         movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
    304         movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
    305         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
    306         movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
    307 
    308         pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
    309         pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
    310 
    311         movdqa  XMMWORD [wk(0)], xmm1   ; word 0 doubles as the "-1" neighbour
    312         movdqa  XMMWORD [wk(1)], xmm2
    313 
    314         poppic  ebx                     ; ebx = inptr0 again
    315 
    316         add     eax, byte SIZEOF_XMMWORD-1
    317         and     eax, byte -SIZEOF_XMMWORD       ; round colctr up to whole blocks
    318         cmp     eax, byte SIZEOF_XMMWORD
    319         ja      short .columnloop               ; more than one block to process
    320         alignx  16,7
    321 
    322 .columnloop_last:
    323         ; -- process the last column block
    324 
    325         pushpic ebx                     ; save inptr0 (restored after .upsample)
    326         movpic  ebx, POINTER [gotptr]   ; load GOT address
    327 
    328         pcmpeqb xmm1,xmm1                       ; xmm1=(all 1's)
    329         pslldq  xmm1,(SIZEOF_XMMWORD-2)         ; xmm1=mask for word 7 only
    330         movdqa  xmm2,xmm1
    331 
    332         pand    xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]    ; from the saved Int0H
    333         pand    xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]    ; from the saved Int1H
    334 
    335         movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    336         movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
    337 
    338         jmp     near .upsample
    339         alignx  16,7
    340 
    341 .columnloop:
    342         ; -- process the next column block
    343 
    344         movdqa  xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
    345         movdqa  xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
    346         movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
    347 
    348         pushpic ebx                     ; save inptr0 (restored after .upsample)
    349         movpic  ebx, POINTER [gotptr]   ; load GOT address
    350 
    351         pxor      xmm3,xmm3             ; xmm3=(all 0's)
    352         movdqa    xmm4,xmm0
    353         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    354         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    355         movdqa    xmm5,xmm1
    356         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    357         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    358         movdqa    xmm6,xmm2
    359         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    360         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    361 
    362         pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]     ; 3 * row[0]
    363         pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
    364 
    365         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    366         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    367         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    368         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    369 
    370         movdqa  XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
    371         movdqa  XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
    372         movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    373         movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
    374 
    375         pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
    376         pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
    377 
    378         movdqa  XMMWORD [wk(2)], xmm1   ; next block's word 0 = "16" for this block
    379         movdqa  XMMWORD [wk(3)], xmm2
    380 
    381 .upsample:
    382         ; -- process the upper row
    383 
    384         movdqa  xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
    385         movdqa  xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
    386 
    387         movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    388         movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    389         psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
    390         pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
    391         movdqa  xmm5,xmm7
    392         movdqa  xmm6,xmm3
    393         psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
    394         pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
    395 
    396         por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
    397         por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
    398 
    399         movdqa  xmm1,xmm7
    400         movdqa  xmm2,xmm3
    401         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
    402         psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
    403         movdqa  xmm4,xmm3
    404         psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
    405 
    406         por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    407         por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
    408 
    409         movdqa  XMMWORD [wk(0)], xmm4   ; word 15 becomes "-1" for the next block
    410 
    411         pmullw  xmm7,[GOTOFF(ebx,PW_THREE)]     ; 3 * current word
    412         pmullw  xmm3,[GOTOFF(ebx,PW_THREE)]
    413         paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]     ; left neighbour + 8
    414         paddw   xmm5,[GOTOFF(ebx,PW_EIGHT)]
    415         paddw   xmm0,[GOTOFF(ebx,PW_SEVEN)]     ; right neighbour + 7
    416         paddw   xmm2,[GOTOFF(ebx,PW_SEVEN)]
    417 
    418         paddw   xmm1,xmm7
    419         paddw   xmm5,xmm3
    420         psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    421         psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    422         paddw   xmm0,xmm7
    423         paddw   xmm2,xmm3
    424         psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    425         psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
    426 
    427         psllw   xmm0,BYTE_BIT           ; shift odd outputs up so that the
    428         psllw   xmm2,BYTE_BIT           ; por below interleaves even and odd
    429         por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    430         por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
    431 
    432         movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; final pixels overwrite the
    433         movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; intermediate data in outptr0
    434 
    435         ; -- process the lower row
    436 
    437         movdqa  xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
    438         movdqa  xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
    439 
    440         movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    441         movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    442         psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
    443         pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
    444         movdqa  xmm0,xmm6
    445         movdqa  xmm2,xmm4
    446         psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
    447         pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
    448 
    449         por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
    450         por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
    451 
    452         movdqa  xmm1,xmm6
    453         movdqa  xmm5,xmm4
    454         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
    455         psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
    456         movdqa  xmm3,xmm4
    457         psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
    458 
    459         por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    460         por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
    461 
    462         movdqa  XMMWORD [wk(1)], xmm3   ; word 15 becomes "-1" for the next block
    463 
    464         pmullw  xmm6,[GOTOFF(ebx,PW_THREE)]     ; 3 * current word
    465         pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
    466         paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]     ; left neighbour + 8
    467         paddw   xmm0,[GOTOFF(ebx,PW_EIGHT)]
    468         paddw   xmm7,[GOTOFF(ebx,PW_SEVEN)]     ; right neighbour + 7
    469         paddw   xmm5,[GOTOFF(ebx,PW_SEVEN)]
    470 
    471         paddw   xmm1,xmm6
    472         paddw   xmm0,xmm4
    473         psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    474         psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    475         paddw   xmm7,xmm6
    476         paddw   xmm5,xmm4
    477         psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    478         psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
    479 
    480         psllw   xmm7,BYTE_BIT           ; shift odd outputs up so that the
    481         psllw   xmm5,BYTE_BIT           ; por below interleaves even and odd
    482         por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    483         por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
    484 
    485         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1    ; final pixels overwrite the
    486         movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0    ; intermediate data in outptr1
    487 
    488         poppic  ebx                     ; ebx = inptr0 again
    489 
    490         sub     eax, byte SIZEOF_XMMWORD
    491         add     ecx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
    492         add     ebx, byte 1*SIZEOF_XMMWORD      ; inptr0
    493         add     esi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
    494         add     edx, byte 2*SIZEOF_XMMWORD      ; outptr0
    495         add     edi, byte 2*SIZEOF_XMMWORD      ; outptr1
    496         cmp     eax, byte SIZEOF_XMMWORD
    497         ja      near .columnloop                ; at least two blocks left
    498         test    eax,eax
    499         jnz     near .columnloop_last           ; exactly one block left
    500 
    501         pop     esi
    502         pop     edi
    503         pop     ecx
    504         pop     eax
    505 
    506         add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
    507         add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
    508         sub     ecx, byte 2                     ; rowctr
    509         jg      near .rowloop
    510 
    511 .return:
    512         pop     edi
    513         pop     esi
    514 ;       pop     edx             ; need not be preserved
    515 ;       pop     ecx             ; need not be preserved
    516         pop     ebx
    517         mov     esp,ebp         ; esp <- aligned ebp
    518         pop     esp             ; esp <- original ebp
    519         pop     ebp
    520         ret
    521 
    522 ; --------------------------------------------------------------------------
    523 ;
    524 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    525 ; It's still a box filter.
    526 ;
    527 ; GLOBAL(void)
    528 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
    529 ;                           JDIMENSION output_width,
    530 ;                           JSAMPARRAY input_data,
    531 ;                           JSAMPARRAY *output_data_ptr);
    532 ; Each input sample is written twice (punpcklbw/punpckhbw of a register with itself).
    533 ; The column count is output_width rounded up to a multiple of 2*SIZEOF_XMMWORD.
    534 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
    535 %define output_width(b)         (b)+12          ; JDIMENSION output_width
    536 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
    537 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
    538 
    539         align   16
    540         global  EXTN(jsimd_h2v1_upsample_sse2)
    541 
    542 EXTN(jsimd_h2v1_upsample_sse2):
    543         push    ebp
    544         mov     ebp,esp
    545 ;       push    ebx             ; unused
    546 ;       push    ecx             ; need not be preserved
    547 ;       push    edx             ; need not be preserved
    548         push    esi
    549         push    edi
    550 
    551         mov     edx, JDIMENSION [output_width(ebp)]
    552         add     edx, byte (2*SIZEOF_XMMWORD)-1
    553         and     edx, byte -(2*SIZEOF_XMMWORD)   ; edx = width rounded up to output blocks
    554         jz      short .return                   ; rounded width is zero: nothing to do
    555 
    556         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
    557         test    ecx,ecx
    558         jz      short .return                   ; zero rows: nothing to do
    559 
    560         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    561         mov     edi, POINTER [output_data_ptr(ebp)]
    562         mov     edi, JSAMPARRAY [edi]                   ; output_data
    563         alignx  16,7
    564 .rowloop:
    565         push    edi
    566         push    esi
    567 
    568         mov     esi, JSAMPROW [esi]             ; inptr
    569         mov     edi, JSAMPROW [edi]             ; outptr
    570         mov     eax,edx                         ; colctr
    571         alignx  16,7
    572 .columnloop:                            ; unrolled: up to two input blocks per pass
    573 
    574         movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; first input block
    575 
    576         movdqa    xmm1,xmm0
    577         punpcklbw xmm0,xmm0             ; low half of the block, each byte doubled
    578         punpckhbw xmm1,xmm1             ; high half of the block, each byte doubled
    579 
    580         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    581         movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
    582 
    583         sub     eax, byte 2*SIZEOF_XMMWORD      ; two output blocks written
    584         jz      short .nextrow
    585 
    586         movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; second input block
    587 
    588         movdqa    xmm3,xmm2
    589         punpcklbw xmm2,xmm2             ; low half of the block, each byte doubled
    590         punpckhbw xmm3,xmm3             ; high half of the block, each byte doubled
    591 
    592         movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    593         movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
    594 
    595         sub     eax, byte 2*SIZEOF_XMMWORD      ; two more output blocks written
    596         jz      short .nextrow
    597 
    598         add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
    599         add     edi, byte 4*SIZEOF_XMMWORD      ; outptr
    600         jmp     short .columnloop
    601         alignx  16,7
    602 
    603 .nextrow:
    604         pop     esi
    605         pop     edi
    606 
    607         add     esi, byte SIZEOF_JSAMPROW       ; input_data
    608         add     edi, byte SIZEOF_JSAMPROW       ; output_data
    609         dec     ecx                             ; rowctr
    610         jg      short .rowloop
    611 
    612 .return:
    613         pop     edi
    614         pop     esi
    615 ;       pop     edx             ; need not be preserved
    616 ;       pop     ecx             ; need not be preserved
    617 ;       pop     ebx             ; unused
    618         pop     ebp
    619         ret
    620 
    621 ; --------------------------------------------------------------------------
    622 ;
    623 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    624 ; It's still a box filter.
    625 ;
    626 ; GLOBAL(void)
    627 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
    628 ;                           JDIMENSION output_width,
    629 ;                           JSAMPARRAY input_data,
    630 ;                           JSAMPARRAY *output_data_ptr);
    631 ; Each input sample is doubled horizontally and the resulting blocks are stored to
    632 ; two consecutive output rows; rowctr therefore steps by 2 per input row.
    633 %define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
    634 %define output_width(b)         (b)+12          ; JDIMENSION output_width
    635 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
    636 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY *output_data_ptr
    637 
    638         align   16
    639         global  EXTN(jsimd_h2v2_upsample_sse2)
    640 
    641 EXTN(jsimd_h2v2_upsample_sse2):
    642         push    ebp
    643         mov     ebp,esp
    644         push    ebx                     ; ebx is used as outptr0 below
    645 ;       push    ecx             ; need not be preserved
    646 ;       push    edx             ; need not be preserved
    647         push    esi
    648         push    edi
    649 
    650         mov     edx, JDIMENSION [output_width(ebp)]
    651         add     edx, byte (2*SIZEOF_XMMWORD)-1
    652         and     edx, byte -(2*SIZEOF_XMMWORD)   ; edx = width rounded up to output blocks
    653         jz      near .return                    ; rounded width is zero: nothing to do
    654 
    655         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
    656         test    ecx,ecx
    657         jz      near .return                    ; zero rows: nothing to do
    658 
    659         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
    660         mov     edi, POINTER [output_data_ptr(ebp)]
    661         mov     edi, JSAMPARRAY [edi]                   ; output_data
    662         alignx  16,7
    663 .rowloop:
    664         push    edi
    665         push    esi
    666 
    667         mov     esi, JSAMPROW [esi]                     ; inptr
    668         mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
    669         mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
    670         mov     eax,edx                                 ; colctr
    671         alignx  16,7
    672 .columnloop:                            ; unrolled: up to two input blocks per pass
    673 
    674         movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; first input block
    675 
    676         movdqa    xmm1,xmm0
    677         punpcklbw xmm0,xmm0             ; low half of the block, each byte doubled
    678         punpckhbw xmm1,xmm1             ; high half of the block, each byte doubled
    679 
    680         movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0    ; the same data goes to
    681         movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1    ; both output rows
    682         movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
    683         movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
    684 
    685         sub     eax, byte 2*SIZEOF_XMMWORD      ; two output blocks written per row
    686         jz      short .nextrow
    687 
    688         movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; second input block
    689 
    690         movdqa    xmm3,xmm2
    691         punpcklbw xmm2,xmm2             ; low half of the block, each byte doubled
    692         punpckhbw xmm3,xmm3             ; high half of the block, each byte doubled
    693 
    694         movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
    695         movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
    696         movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
    697         movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
    698 
    699         sub     eax, byte 2*SIZEOF_XMMWORD      ; two more output blocks written per row
    700         jz      short .nextrow
    701 
    702         add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
    703         add     ebx, byte 4*SIZEOF_XMMWORD      ; outptr0
    704         add     edi, byte 4*SIZEOF_XMMWORD      ; outptr1
    705         jmp     short .columnloop
    706         alignx  16,7
    707 
    708 .nextrow:
    709         pop     esi
    710         pop     edi
    711 
    712         add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
    713         add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
    714         sub     ecx, byte 2                     ; rowctr
    715         jg      short .rowloop
    716 
    717 .return:
    718         pop     edi
    719         pop     esi
    720 ;       pop     edx             ; need not be preserved
    721 ;       pop     ecx             ; need not be preserved
    722         pop     ebx
    723         pop     ebp
    724         ret
    725 
    726 ; For some reason, the OS X linker does not honor the request to align the
    727 ; segment unless we do this.
    728         align   16
    729