Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsample.asm - upsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22         SECTION SEG_CONST
     23 
     24         alignz  16
     25         global  EXTN(jconst_fancy_upsample_sse2)
     26 
     27 EXTN(jconst_fancy_upsample_sse2):
        ; Word constants used by the two "fancy" (triangle-filter) upsamplers
        ; in this file.  Each entry is 8 x 16-bit words, i.e. one 16-byte
        ; XMM-sized memory operand, and is referenced as [rel PW_xxx].
     28 
     29 PW_ONE          times 8 dw  1           ; added before the >>2 that yields the even h2v1 outputs
     30 PW_TWO          times 8 dw  2           ; added before the >>2 that yields the odd h2v1 outputs
     31 PW_THREE        times 8 dw  3           ; the x3 weight (pmullw operand) in both fancy routines
     32 PW_SEVEN        times 8 dw  7           ; added before the >>4 that yields the odd h2v2 outputs
     33 PW_EIGHT        times 8 dw  8           ; added before the >>4 that yields the even h2v2 outputs
     34 
     35         alignz  16
     36 
     37 ; --------------------------------------------------------------------------
     38         SECTION SEG_TEXT
     39         BITS    64
     40 ;
     41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     42 ;
     43 ; The upsampling algorithm is linear interpolation between pixel centers,
     44 ; also known as a "triangle filter".  This is a good compromise between
     45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     46 ; of the way between input pixel centers.
     47 ;
     48 ; GLOBAL(void)
     49 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
     50 ;                                 JDIMENSION downsampled_width,
     51 ;                                 JSAMPARRAY input_data,
     52 ;                                 JSAMPARRAY *output_data_ptr);
     53 ;
     54 
     55 ; r10 = int max_v_samp_factor
     56 ; r11 = JDIMENSION downsampled_width
     57 ; r12 = JSAMPARRAY input_data
     58 ; r13 = JSAMPARRAY *output_data_ptr
     59 
     60         align   16
     61         global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
     62 
     63 EXTN(jsimd_h2v1_fancy_upsample_sse2):
        ; Register roles in this routine:
        ;   rax = colctr (input samples left in the row), rcx = rowctr,
        ;   rsi = input_data cursor, rdi = output_data cursor; inside a row
        ;   rsi/rdi are reloaded as inptr/outptr.
        ;   xmm0 = all zero (for byte->word unpacking),
        ;   xmm7 = left-neighbour sample of the current 16-sample block (lane 0),
        ;   xmm6 = right-neighbour sample of the current block (lane 15).
        ; Each pass of the column loop reads 16 input samples and writes 32
        ; output samples.  All row accesses use movdqa, so the row buffers
        ; must be 16-byte aligned.
     64         push    rbp
     65         mov     rax,rsp                 ; not read again below; presumably consumed by collect_args (TODO confirm)
     66         mov     rbp,rsp
     67         collect_args                    ; macro defined outside this file; args are expected in r10-r13 afterwards
     68 
     69         mov     eax, r11d  ; colctr (32-bit move zero-extends into rax)
     70         test    rax,rax
     71         jz      near .return            ; nothing to do for zero width
     72 
        ; NOTE(review): all 64 bits of r10 are copied although the argument is
        ; a 32-bit int; this relies on the upper half being clean after
        ; collect_args -- confirm against the macro / calling convention.
     73         mov     rcx, r10        ; rowctr
     74         test    rcx,rcx
     75         jz      near .return            ; nothing to do for zero rows
     76 
     77         mov     rsi, r12        ; input_data
     78         mov     rdi, r13
     79         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
     80 .rowloop:
     81         push    rax                     ; colctr
     82         push    rdi                     ; save output_data cursor (rdi is reused as outptr)
     83         push    rsi                     ; save input_data cursor (rsi is reused as inptr)
     84 
     85         mov     rsi, JSAMPROW [rsi]     ; inptr
     86         mov     rdi, JSAMPROW [rdi]     ; outptr
     87 
     88         test    rax, SIZEOF_XMMWORD-1
     89         jz      short .skip             ; width is a multiple of the vector size: no padding sample needed
     90         mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
     91         mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        ; The dummy is a copy of the last real sample written one position past
        ; the end of the input row, so the row buffer must have room for it.
     92 .skip:
     93         pxor    xmm0,xmm0               ; xmm0=(all 0's)
     94         pcmpeqb xmm7,xmm7
     95         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; mask that keeps only byte lane 0
     96         pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7 = sample 0 in lane 0: column "-1" is taken to be column 0
     97 
     98         add     rax, byte SIZEOF_XMMWORD-1
     99         and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a whole number of blocks
    100         cmp     rax, byte SIZEOF_XMMWORD
    101         ja      short .columnloop               ; more than one block left
    102 
    103 .columnloop_last:
        ; Last block of the row: the right neighbour of its final sample is
        ; that sample itself.
    104         pcmpeqb xmm6,xmm6
    105         pslldq  xmm6,(SIZEOF_XMMWORD-1) ; mask that keeps only byte lane 15
    106         pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    107         jmp     short .upsample
    108 
    109 .columnloop:
        ; Not the last block: the right neighbour is sample 0 of the next
        ; block, moved into byte lane 15.
    110         movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    111         pslldq  xmm6,(SIZEOF_XMMWORD-1)
    112 
    113 .upsample:
    114         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    115         movdqa  xmm2,xmm1
    116         movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
    117         pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
    118         psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
    119 
    120         por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
    121         por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
    122 
        ; Carry this block's last sample forward as the next block's
        ; left neighbour.
    123         movdqa  xmm7,xmm1
    124         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
    125 
        ; Widen bytes to words (interleave with zero) so the weighted sums
        ; below do not overflow.
    126         movdqa    xmm4,xmm1
    127         punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
    128         punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
    129         movdqa    xmm5,xmm2
    130         punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
    131         punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
    132         movdqa    xmm6,xmm3
    133         punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
    134         punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
    135 
        ; even output = (3*cur + left  + 1) >> 2
        ; odd  output = (3*cur + right + 2) >> 2
    136         pmullw  xmm1,[rel PW_THREE]
    137         pmullw  xmm4,[rel PW_THREE]
    138         paddw   xmm2,[rel PW_ONE]
    139         paddw   xmm5,[rel PW_ONE]
    140         paddw   xmm3,[rel PW_TWO]
    141         paddw   xmm6,[rel PW_TWO]
    142 
    143         paddw   xmm2,xmm1
    144         paddw   xmm5,xmm4
    145         psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    146         psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    147         paddw   xmm3,xmm1
    148         paddw   xmm6,xmm4
    149         psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    150         psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
    151 
        ; Move the odd results into the high byte of each word and OR them
        ; with the even results to get the interleaved output bytes.
    152         psllw   xmm3,BYTE_BIT
    153         psllw   xmm6,BYTE_BIT
    154         por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    155         por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
    156 
    157         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    158         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
    159 
    160         sub     rax, byte SIZEOF_XMMWORD
    161         add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
    162         add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
    163         cmp     rax, byte SIZEOF_XMMWORD
    164         ja      near .columnloop                ; more than one block left
        ; NOTE(review): 32-bit test here, whereas the h2v2 fancy routine in
        ; this file uses `test rax,rax` at the same point.
    165         test    eax,eax
    166         jnz     near .columnloop_last           ; exactly one block left
    167 
        ; Row finished: restore the values pushed at .rowloop (reverse order).
    168         pop     rsi
    169         pop     rdi
    170         pop     rax
    171 
    172         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
    173         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
    174         dec     rcx                             ; rowctr
    175         jg      near .rowloop                   ; signed: loop while rowctr > 0
    176 
    177 .return:
    178         uncollect_args                  ; macro defined outside this file; counterpart of collect_args
    179         pop     rbp
    180         ret
    181 
    182 ; --------------------------------------------------------------------------
    183 ;
    184 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    185 ; Again a triangle filter; see comments for h2v1 case, above.
    186 ;
    187 ; GLOBAL(void)
    188 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
    189 ;                                 JDIMENSION downsampled_width,
    190 ;                                 JSAMPARRAY input_data,
    191 ;                                 JSAMPARRAY *output_data_ptr);
    192 ;
    193 
    194 ; r10 = int max_v_samp_factor
    195 ; r11 = JDIMENSION downsampled_width
    196 ; r12 = JSAMPARRAY input_data
    197 ; r13 = JSAMPARRAY *output_data_ptr
    198 
        ; wk(i) addresses the i-th 16-byte scratch slot just below the aligned
        ; rbp set up in the prologue (wk(0) is the lowest address).
        ; Slot usage in this routine:
        ;   wk(0)/wk(1) = left-neighbour word for the upper/lower output row
        ;   wk(2)/wk(3) = right-neighbour word for the upper/lower output row
    199 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
    200 %define WK_NUM          4
    201 
    202         align   16
    203         global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
    204 
    205 EXTN(jsimd_h2v2_fancy_upsample_sse2):
        ; Register roles inside a row pass:
        ;   rax = colctr, rcx = inptr1(above), rbx = inptr0, rsi = inptr1(below),
        ;   rdx = outptr0 (upper output row), rdi = outptr1 (lower output row).
        ; Each pass of the row loop consumes one input row pointer and produces
        ; two output rows.  The vertical step computes 16-bit intermediates
        ; (3*row[0] + neighbouring row) and parks them in the output rows
        ; themselves; the horizontal step then overwrites them with the final
        ; 8-bit samples.  All row accesses use movdqa, so the row buffers must
        ; be 16-byte aligned.
    206         push    rbp
    207         mov     rax,rsp                         ; rax = rsp after the push (address of the saved rbp)
        ; NOTE(review): only 4 bytes are reserved before aligning although an
        ; 8-byte value is stored at [rsp] below; this appears to rely on rsp
        ; being at least 8-byte aligned here, so that the aligned slot ends up
        ; entirely below the saved rbp.
    208         sub     rsp, byte 4
    209         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    210         mov     [rsp],rax                       ; remember the unaligned rsp for the epilogue
    211         mov     rbp,rsp                         ; rbp = aligned rbp
    212         lea     rsp, [wk(0)]                    ; reserve the WK_NUM scratch slots below rbp
    213         collect_args                            ; macro defined outside this file; args are expected in r10-r13 afterwards
    214         push    rbx                             ; rbx is used as inptr0 below
    215 
    216         mov     eax, r11d  ; colctr (32-bit move zero-extends into rax)
    217         test    rax,rax
    218         jz      near .return
    219 
        ; NOTE(review): all 64 bits of r10 are copied although the argument is
        ; a 32-bit int; this relies on the upper half being clean after
        ; collect_args -- confirm against the macro / calling convention.
    220         mov     rcx, r10        ; rowctr
    221         test    rcx,rcx
    222         jz      near .return
    223 
    224         mov     rsi, r12        ; input_data
    225         mov     rdi, r13
    226         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
    227 .rowloop:
    228         push    rax                                     ; colctr
    229         push    rcx                                     ; rowctr (rcx is reused as inptr1(above))
    230         push    rdi                                     ; output_data cursor
    231         push    rsi                                     ; input_data cursor
    232 
        ; Reads the row pointers at input_data[-1], [0] and [+1]; the caller
        ; must make all three valid (not checked here).
    233         mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
    234         mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
    235         mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
    236         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
    237         mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
    238 
    239         test    rax, SIZEOF_XMMWORD-1
    240         jz      short .skip                             ; width is a multiple of the vector size: no padding sample needed
    241         push    rdx                                     ; dl is used as scratch while rdx holds outptr0
    242         mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    243         mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    244         mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    245         mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    246         mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    247         mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    248         pop     rdx
        ; Each of the three input rows now has a copy of its last real sample
        ; one position past its end, so the row buffers must have room for it.
    249 .skip:
    250         ; -- process the first column block
    251 
    252         movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
    253         movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
    254         movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
    255 
    256         pxor      xmm3,xmm3             ; xmm3=(all 0's)
    257         movdqa    xmm4,xmm0
    258         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    259         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    260         movdqa    xmm5,xmm1
    261         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    262         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    263         movdqa    xmm6,xmm2
    264         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    265         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    266 
    267         pmullw  xmm0,[rel PW_THREE]
    268         pmullw  xmm4,[rel PW_THREE]
    269 
    270         pcmpeqb xmm7,xmm7
    271         psrldq  xmm7,(SIZEOF_XMMWORD-2) ; mask that keeps only word lane 0
    272 
        ; Vertical step: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1].
    273         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    274         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    275         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    276         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    277 
    278         movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
    279         movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
    280         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    281         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
    282 
    283         pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
    284         pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
    285 
        ; Column "-1" of the first block is taken to be column 0 itself.
    286         movdqa  XMMWORD [wk(0)], xmm1
    287         movdqa  XMMWORD [wk(1)], xmm2
    288 
    289         add     rax, byte SIZEOF_XMMWORD-1
    290         and     rax, byte -SIZEOF_XMMWORD       ; round colctr up to a whole number of blocks
    291         cmp     rax, byte SIZEOF_XMMWORD
    292         ja      short .columnloop               ; more than one block left
    293 
    294 .columnloop_last:
    295         ; -- process the last column block
    296 
        ; The right neighbour of the final column is that column itself:
        ; keep only word lane 7 of the saved high intermediates.
    297         pcmpeqb xmm1,xmm1
    298         pslldq  xmm1,(SIZEOF_XMMWORD-2)
    299         movdqa  xmm2,xmm1
    300 
    301         pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    302         pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
    303 
    304         movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    305         movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
    306 
    307         jmp     near .upsample
    308 
    309 .columnloop:
    310         ; -- process the next column block
    311 
        ; Compute the vertical intermediates of the NEXT block first, because
        ; its column 0 is the right neighbour of the current block.
    312         movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
    313         movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
    314         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
    315 
    316         pxor      xmm3,xmm3             ; xmm3=(all 0's)
    317         movdqa    xmm4,xmm0
    318         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    319         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    320         movdqa    xmm5,xmm1
    321         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    322         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    323         movdqa    xmm6,xmm2
    324         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    325         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    326 
    327         pmullw  xmm0,[rel PW_THREE]
    328         pmullw  xmm4,[rel PW_THREE]
    329 
    330         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    331         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    332         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    333         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    334 
        ; Parked in the output area of the next block (slots 2 and 3).
    335         movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
    336         movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
    337         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    338         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
    339 
    340         pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
    341         pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
    342 
    343         movdqa  XMMWORD [wk(2)], xmm1
    344         movdqa  XMMWORD [wk(3)], xmm2
    345 
    346 .upsample:
    347         ; -- process the upper row
    348 
        ; Horizontal step on the saved intermediates:
        ;   even output = (3*Int + left  + 8) >> 4
        ;   odd  output = (3*Int + right + 7) >> 4
    349         movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    350         movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    351 
    352         movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    353         movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    354         psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
    355         pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
    356         movdqa  xmm5,xmm7
    357         movdqa  xmm6,xmm3
    358         psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
    359         pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
    360 
    361         por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
    362         por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
    363 
    364         movdqa  xmm1,xmm7
    365         movdqa  xmm2,xmm3
    366         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
    367         psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
    368         movdqa  xmm4,xmm3
    369         psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
    370 
    371         por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    372         por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
    373 
    374         movdqa  XMMWORD [wk(0)], xmm4   ; column 15 becomes the next block's left neighbour
    375 
    376         pmullw  xmm7,[rel PW_THREE]
    377         pmullw  xmm3,[rel PW_THREE]
    378         paddw   xmm1,[rel PW_EIGHT]
    379         paddw   xmm5,[rel PW_EIGHT]
    380         paddw   xmm0,[rel PW_SEVEN]
    381         paddw   xmm2,[rel PW_SEVEN]
    382 
    383         paddw   xmm1,xmm7
    384         paddw   xmm5,xmm3
    385         psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    386         psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    387         paddw   xmm0,xmm7
    388         paddw   xmm2,xmm3
    389         psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    390         psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
    391 
        ; Odd results go to the high byte of each word, then OR with the even
        ; results to get the interleaved output bytes.
    392         psllw   xmm0,BYTE_BIT
    393         psllw   xmm2,BYTE_BIT
    394         por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    395         por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
    396 
        ; Final samples overwrite the intermediates parked in the upper row.
    397         movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
    398         movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
    399 
    400         ; -- process the lower row
    401 
        ; Same horizontal step as above, on Int1 with wk(1)/wk(3).
    402         movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    403         movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
    404 
    405         movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    406         movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    407         psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
    408         pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
    409         movdqa  xmm0,xmm6
    410         movdqa  xmm2,xmm4
    411         psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
    412         pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
    413 
    414         por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
    415         por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
    416 
    417         movdqa  xmm1,xmm6
    418         movdqa  xmm5,xmm4
    419         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
    420         psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
    421         movdqa  xmm3,xmm4
    422         psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
    423 
    424         por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    425         por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
    426 
    427         movdqa  XMMWORD [wk(1)], xmm3   ; column 15 becomes the next block's left neighbour
    428 
    429         pmullw  xmm6,[rel PW_THREE]
    430         pmullw  xmm4,[rel PW_THREE]
    431         paddw   xmm1,[rel PW_EIGHT]
    432         paddw   xmm0,[rel PW_EIGHT]
    433         paddw   xmm7,[rel PW_SEVEN]
    434         paddw   xmm5,[rel PW_SEVEN]
    435 
    436         paddw   xmm1,xmm6
    437         paddw   xmm0,xmm4
    438         psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    439         psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    440         paddw   xmm7,xmm6
    441         paddw   xmm5,xmm4
    442         psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    443         psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
    444 
    445         psllw   xmm7,BYTE_BIT
    446         psllw   xmm5,BYTE_BIT
    447         por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    448         por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
    449 
    450         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    451         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
    452 
    453         sub     rax, byte SIZEOF_XMMWORD
    454         add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
    455         add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
    456         add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
    457         add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
    458         add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
    459         cmp     rax, byte SIZEOF_XMMWORD
    460         ja      near .columnloop                ; more than one block left
    461         test    rax,rax
    462         jnz     near .columnloop_last           ; exactly one block left
    463 
        ; Row pair finished: restore the values pushed at .rowloop (reverse order).
    464         pop     rsi
    465         pop     rdi
    466         pop     rcx
    467         pop     rax
    468 
    469         add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
    470         add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
    471         sub     rcx, byte 2                     ; rowctr
    472         jg      near .rowloop                   ; signed: loop while rowctr > 0
    473 
    474 .return:
    475         pop     rbx
    476         uncollect_args                  ; macro defined outside this file; counterpart of collect_args
    477         mov     rsp,rbp         ; rsp <- aligned rbp
    478         pop     rsp             ; rsp <- value stored at [aligned rbp] in the prologue (rsp right after push rbp)
    479         pop     rbp
    480         ret
    481 
    482 ; --------------------------------------------------------------------------
    483 ;
    484 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    485 ; It's still a box filter.
    486 ;
    487 ; GLOBAL(void)
    488 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
    489 ;                           JDIMENSION output_width,
    490 ;                           JSAMPARRAY input_data,
    491 ;                           JSAMPARRAY *output_data_ptr);
    492 ;
    493 
    494 ; r10 = int max_v_samp_factor
    495 ; r11 = JDIMENSION output_width
    496 ; r12 = JSAMPARRAY input_data
    497 ; r13 = JSAMPARRAY *output_data_ptr
    498 
    499         align   16
    500         global  EXTN(jsimd_h2v1_upsample_sse2)
    501 
    502 EXTN(jsimd_h2v1_upsample_sse2):
        ; Register roles in this routine:
        ;   rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD,
        ;   rax = colctr (output bytes left in the row), rcx = rowctr,
        ;   rsi = input_data cursor / inptr, rdi = output_data cursor / outptr.
        ; Every input byte is duplicated horizontally: 16 input bytes become
        ; 32 output bytes.  The column loop is unrolled twice.  All row
        ; accesses use movdqa, so the row buffers must be 16-byte aligned and
        ; large enough for the rounded-up width.
    503         push    rbp
    504         mov     rax,rsp                 ; not read again below; presumably consumed by collect_args (TODO confirm)
    505         mov     rbp,rsp
    506         collect_args                    ; macro defined outside this file; args are expected in r10-r13 afterwards
    507 
    508         mov     edx, r11d               ; output_width (32-bit move zero-extends into rdx)
    509         add     rdx, byte (2*SIZEOF_XMMWORD)-1
    510         and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to whole output blocks
    511         jz      near .return                    ; width 0: nothing to do
    512 
        ; NOTE(review): all 64 bits of r10 are copied although the argument is
        ; a 32-bit int; this relies on the upper half being clean after
        ; collect_args -- confirm against the macro / calling convention.
    513         mov     rcx, r10        ; rowctr
    514         test    rcx,rcx
    515         jz      short .return
    516 
    517         mov     rsi, r12 ; input_data
    518         mov     rdi, r13
    519         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
    520 .rowloop:
    521         push    rdi                             ; save output_data cursor (rdi is reused as outptr)
    522         push    rsi                             ; save input_data cursor (rsi is reused as inptr)
    523 
    524         mov     rsi, JSAMPROW [rsi]             ; inptr
    525         mov     rdi, JSAMPROW [rdi]             ; outptr
    526         mov     rax,rdx                         ; colctr
    527 .columnloop:
    528 
    529         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    530 
        ; Interleaving a register with itself doubles every byte.
    531         movdqa    xmm1,xmm0
    532         punpcklbw xmm0,xmm0             ; low 8 bytes, each repeated twice
    533         punpckhbw xmm1,xmm1             ; high 8 bytes, each repeated twice
    534 
    535         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    536         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    537 
    538         sub     rax, byte 2*SIZEOF_XMMWORD
    539         jz      short .nextrow          ; row done after the first half of the unrolled body
    540 
        ; Second half of the unrolled body: next 16 input bytes.
    541         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    542 
    543         movdqa    xmm3,xmm2
    544         punpcklbw xmm2,xmm2
    545         punpckhbw xmm3,xmm3
    546 
    547         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    548         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    549 
    550         sub     rax, byte 2*SIZEOF_XMMWORD
    551         jz      short .nextrow
    552 
    553         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    554         add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
    555         jmp     short .columnloop
    556 
    557 .nextrow:
    558         pop     rsi
    559         pop     rdi
    560 
    561         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
    562         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
    563         dec     rcx                             ; rowctr
    564         jg      short .rowloop                  ; signed: loop while rowctr > 0
    565 
    566 .return:
    567         uncollect_args                  ; macro defined outside this file; counterpart of collect_args
    568         pop     rbp
    569         ret
    570 
    571 ; --------------------------------------------------------------------------
    572 ;
    573 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    574 ; It's still a box filter.
    575 ;
    576 ; GLOBAL(void)
    577 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
    578 ;                           JDIMENSION output_width,
    579 ;                           JSAMPARRAY input_data,
    580 ;                           JSAMPARRAY *output_data_ptr);
    581 ;
    582 
    583 ; r10 = int max_v_samp_factor
    584 ; r11 = JDIMENSION output_width
    585 ; r12 = JSAMPARRAY input_data
    586 ; r13 = JSAMPARRAY *output_data_ptr
    587 
    588         align   16
    589         global  EXTN(jsimd_h2v2_upsample_sse2)
    590 
    591 EXTN(jsimd_h2v2_upsample_sse2):
        ; Register roles in this routine:
        ;   rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD,
        ;   rax = colctr (output bytes left in the row), rcx = rowctr,
        ;   rsi = input_data cursor / inptr,
        ;   rbx = outptr0, rdi = output_data cursor / outptr1.
        ; Every input byte is duplicated horizontally and the resulting row is
        ; stored to two consecutive output rows.  The column loop is unrolled
        ; twice.  All row accesses use movdqa, so the row buffers must be
        ; 16-byte aligned and large enough for the rounded-up width.
    592         push    rbp
    593         mov     rax,rsp                 ; not read again below; presumably consumed by collect_args (TODO confirm)
    594         mov     rbp,rsp
    595         collect_args                    ; macro defined outside this file; args are expected in r10-r13 afterwards
    596         push    rbx                     ; rbx is used as outptr0 below
    597 
    598         mov     edx, r11d               ; output_width (32-bit move zero-extends into rdx)
    599         add     rdx, byte (2*SIZEOF_XMMWORD)-1
    600         and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to whole output blocks
    601         jz      near .return                    ; width 0: nothing to do
    602 
        ; NOTE(review): all 64 bits of r10 are copied although the argument is
        ; a 32-bit int; this relies on the upper half being clean after
        ; collect_args -- confirm against the macro / calling convention.
    603         mov     rcx, r10        ; rowctr
    604         test    rcx,rcx
    605         jz      near .return
    606 
    607         mov     rsi, r12        ; input_data
    608         mov     rdi, r13
    609         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
    610 .rowloop:
    611         push    rdi                                     ; save output_data cursor (rdi is reused as outptr1)
    612         push    rsi                                     ; save input_data cursor (rsi is reused as inptr)
    613 
    614         mov     rsi, JSAMPROW [rsi]                     ; inptr
    615         mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
    616         mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
    617         mov     rax,rdx                                 ; colctr
    618 .columnloop:
    619 
    620         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    621 
        ; Interleaving a register with itself doubles every byte.
    622         movdqa    xmm1,xmm0
    623         punpcklbw xmm0,xmm0             ; low 8 bytes, each repeated twice
    624         punpckhbw xmm1,xmm1             ; high 8 bytes, each repeated twice
    625 
        ; The same 32 bytes go to both output rows.
    626         movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    627         movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    628         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    629         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    630 
    631         sub     rax, byte 2*SIZEOF_XMMWORD
    632         jz      short .nextrow          ; row done after the first half of the unrolled body
    633 
        ; Second half of the unrolled body: next 16 input bytes.
    634         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    635 
    636         movdqa    xmm3,xmm2
    637         punpcklbw xmm2,xmm2
    638         punpckhbw xmm3,xmm3
    639 
    640         movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    641         movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    642         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    643         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    644 
    645         sub     rax, byte 2*SIZEOF_XMMWORD
    646         jz      short .nextrow
    647 
    648         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    649         add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
    650         add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
    651         jmp     short .columnloop
    652 
    653 .nextrow:
    654         pop     rsi
    655         pop     rdi
    656 
        ; One input row yields two output rows, so rowctr drops by 2 per pass.
    657         add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
    658         add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
    659         sub     rcx, byte 2                     ; rowctr
    660         jg      near .rowloop                   ; signed: loop while rowctr > 0
    661 
    662 .return:
    663         pop     rbx
    664         uncollect_args                  ; macro defined outside this file; counterpart of collect_args
    665         pop     rbp
    666         ret
    667 
    668 ; For some reason, the OS X linker does not honor the request to align the
    669 ; segment unless we do this.
    670         align   16                      ; pad the end of the section to a 16-byte boundary
    671