Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jdsample.asm - upsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jsimdext.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23         SECTION SEG_CONST
     24 ; Packed-word constants used by the two fancy (triangle filter) upsamplers.
     25         alignz  16
     26         global  EXTN(jconst_fancy_upsample_sse2)
     27 ; Each PW_* entry is 8 identical 16-bit words, i.e. one 16-byte XMM operand.
     28 EXTN(jconst_fancy_upsample_sse2):
     29 
     30 PW_ONE          times 8 dw  1   ; h2v1 fancy: bias added to even outputs before >>2
     31 PW_TWO          times 8 dw  2   ; h2v1 fancy: bias added to odd outputs before >>2
     32 PW_THREE        times 8 dw  3   ; weight (pmullw) applied to the center sample/row
     33 PW_SEVEN        times 8 dw  7   ; h2v2 fancy: bias added to odd outputs before >>4
     34 PW_EIGHT        times 8 dw  8   ; h2v2 fancy: bias added to even outputs before >>4
     35 
     36         alignz  16
     37 
     38 ; --------------------------------------------------------------------------
     39         SECTION SEG_TEXT
     40         BITS    64
     41 ;
     42 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
     43 ;
     44 ; The upsampling algorithm is linear interpolation between pixel centers,
     45 ; also known as a "triangle filter".  This is a good compromise between
     46 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
     47 ; of the way between input pixel centers.
     48 ;
     49 ; GLOBAL(void)
     50 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
     51 ;                                 JDIMENSION downsampled_width,
     52 ;                                 JSAMPARRAY input_data,
     53 ;                                 JSAMPARRAY * output_data_ptr);
     54 ;
     55 ; Per row: out[2i] = (3*in[i] + in[i-1] + 1) >> 2,  out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
     56 ; r10 = int max_v_samp_factor
     57 ; r11 = JDIMENSION downsampled_width
     58 ; r12 = JSAMPARRAY input_data
     59 ; r13 = JSAMPARRAY * output_data_ptr
     60 ; (register assignment as left by collect_args, a macro defined outside this file)
     61         align   16
     62         global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
     63 
     64 EXTN(jsimd_h2v1_fancy_upsample_sse2):
     65         push    rbp
     66         mov     rax,rsp         ; rax = frame base (presumably read by collect_args -- confirm)
     67         mov     rbp,rsp
     68         collect_args
     69 
     70         mov     eax, r11d  ; colctr (32-bit move: upper half of r11 is ignored)
     71         test    rax,rax
     72         jz      near .return
     73 
     74         movsxd  rcx, r10d       ; rowctr: int argument, only the low 32 bits of r10 are defined
     75         test    rcx,rcx
     76         jz      near .return
     77 
     78         mov     rsi, r12        ; input_data
     79         mov     rdi, r13
     80         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
     81 .rowloop:
     82         push    rax                     ; colctr
     83         push    rdi
     84         push    rsi
     85 
     86         mov     rsi, JSAMPROW [rsi]     ; inptr
     87         mov     rdi, JSAMPROW [rdi]     ; outptr
     88 
     89         test    rax, SIZEOF_XMMWORD-1   ; width a multiple of the block size?
     90         jz      short .skip
     91         mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]        ; dl = last real sample
     92         mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
     93 .skip:
     94         pxor    xmm0,xmm0               ; xmm0=(all 0's)
     95         pcmpeqb xmm7,xmm7               ; xmm7=(all 1's)
     96         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; mask that keeps only byte 0
     97         pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- ... --): sample 0 acts as sample -1
     98 
     99         add     rax, byte SIZEOF_XMMWORD-1
    100         and     rax, byte -SIZEOF_XMMWORD       ; colctr rounded up to whole blocks
    101         cmp     rax, byte SIZEOF_XMMWORD
    102         ja      short .columnloop               ; more than one block to do
    103 
    104 .columnloop_last:
    105         pcmpeqb xmm6,xmm6
    106         pslldq  xmm6,(SIZEOF_XMMWORD-1) ; mask that keeps only byte 15
    107         pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm6=(-- ... -- 15): sample 15 acts as sample 16
    108         jmp     short .upsample
    109 
    110 .columnloop:
    111         movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; peek at the next input block
    112         pslldq  xmm6,(SIZEOF_XMMWORD-1)                 ; xmm6=(-- ... -- 16)
    113 
    114 .upsample:
    115         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    116         movdqa  xmm2,xmm1
    117         movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
    118         pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
    119         psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
    120 
    121         por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
    122         por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
    123 
    124         movdqa  xmm7,xmm1               ; carry sample 15 into the next block's -1 slot
    125         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
    126 
    127         movdqa    xmm4,xmm1
    128         punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
    129         punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
    130         movdqa    xmm5,xmm2
    131         punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
    132         punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
    133         movdqa    xmm6,xmm3
    134         punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
    135         punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
    136 
    137         pmullw  xmm1,[rel PW_THREE]     ; 3*in[i], low half
    138         pmullw  xmm4,[rel PW_THREE]     ; 3*in[i], high half
    139         paddw   xmm2,[rel PW_ONE]       ; in[i-1] + 1
    140         paddw   xmm5,[rel PW_ONE]
    141         paddw   xmm3,[rel PW_TWO]       ; in[i+1] + 2
    142         paddw   xmm6,[rel PW_TWO]
    143 
    144         paddw   xmm2,xmm1
    145         paddw   xmm5,xmm4
    146         psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
    147         psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
    148         paddw   xmm3,xmm1
    149         paddw   xmm6,xmm4
    150         psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
    151         psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
    152 
    153         psllw   xmm3,BYTE_BIT           ; odd outputs go to the high byte of each word
    154         psllw   xmm6,BYTE_BIT
    155         por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
    156         por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
    157 
    158         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2    ; aligned stores: outptr must be 16-byte aligned
    159         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
    160 
    161         sub     rax, byte SIZEOF_XMMWORD
    162         add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
    163         add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
    164         cmp     rax, byte SIZEOF_XMMWORD
    165         ja      near .columnloop                ; at least two blocks remain
    166         test    eax,eax
    167         jnz     near .columnloop_last           ; exactly one block remains
    168 
    169         pop     rsi
    170         pop     rdi
    171         pop     rax
    172 
    173         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
    174         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
    175         dec     rcx                             ; rowctr
    176         jg      near .rowloop
    177 
    178 .return:
    179         uncollect_args
    180         pop     rbp
    181         ret
    182 
    183 ; --------------------------------------------------------------------------
    184 ;
    185 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
    186 ; Again a triangle filter; see comments for h2v1 case, above.
    187 ;
    188 ; GLOBAL(void)
    189 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
    190 ;                                 JDIMENSION downsampled_width,
    191 ;                                 JSAMPARRAY input_data,
    192 ;                                 JSAMPARRAY * output_data_ptr);
    193 ;
    194 ; Vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1] (16-bit words).
    195 ; r10 = int max_v_samp_factor
    196 ; r11 = JDIMENSION downsampled_width
    197 ; r12 = JSAMPARRAY input_data
    198 ; r13 = JSAMPARRAY * output_data_ptr
    199 ; Horizontal pass: out[2i] = (3*Int[i] + Int[i-1] + 8) >> 4, out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
    200 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
    201 %define WK_NUM          4
    202 ; wk(0)/wk(1): Int[-1] neighbour for upper/lower row; wk(2)/wk(3): Int[16] neighbour for upper/lower row
    203         align   16
    204         global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
    205 
    206 EXTN(jsimd_h2v2_fancy_upsample_sse2):
    207         push    rbp
    208         mov     rax,rsp                         ; rax = rsp after the push (unaligned frame base)
    209         sub     rsp, byte 4                     ; step below the pushed rbp before aligning
    210         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
    211         mov     [rsp],rax                       ; remember the unaligned rsp for the epilogue
    212         mov     rbp,rsp                         ; rbp = aligned rbp
    213         lea     rsp, [wk(0)]                    ; reserve WK_NUM xmmwords below rbp
    214         collect_args
    215         push    rbx
    216 
    217         mov     eax, r11d  ; colctr (32-bit move: upper half of r11 is ignored)
    218         test    rax,rax
    219         jz      near .return
    220 
    221         movsxd  rcx, r10d       ; rowctr: int argument, only the low 32 bits of r10 are defined
    222         test    rcx,rcx
    223         jz      near .return
    224 
    225         mov     rsi, r12        ; input_data
    226         mov     rdi, r13
    227         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
    228 .rowloop:
    229         push    rax                                     ; colctr
    230         push    rcx                                     ; rowctr (rcx is reused as a row pointer below)
    231         push    rdi
    232         push    rsi
    233 
    234         mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
    235         mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
    236         mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
    237         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
    238         mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
    239 
    240         test    rax, SIZEOF_XMMWORD-1   ; width a multiple of the block size?
    241         jz      short .skip
    242         push    rdx                     ; dl is used as scratch; keep outptr0
    243         mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
    244         mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
    245         mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
    246         mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
    247         mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
    248         mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
    249         pop     rdx
    250 .skip:
    251         ; -- process the first column block
    252 
    253         movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
    254         movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
    255         movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
    256 
    257         pxor      xmm3,xmm3             ; xmm3=(all 0's)
    258         movdqa    xmm4,xmm0
    259         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    260         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    261         movdqa    xmm5,xmm1
    262         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    263         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    264         movdqa    xmm6,xmm2
    265         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    266         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    267 
    268         pmullw  xmm0,[rel PW_THREE]     ; 3*row[0], low half
    269         pmullw  xmm4,[rel PW_THREE]     ; 3*row[0], high half
    270 
    271         pcmpeqb xmm7,xmm7
    272         psrldq  xmm7,(SIZEOF_XMMWORD-2) ; mask that keeps only word 0
    273 
    274         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    275         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    276         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    277         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    278 
    279         movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
    280         movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
    281         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
    282         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
    283 
    284         pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
    285         pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
    286 
    287         movdqa  XMMWORD [wk(0)], xmm1   ; Int0[0] acts as Int0[-1] for the first block
    288         movdqa  XMMWORD [wk(1)], xmm2   ; Int1[0] acts as Int1[-1] for the first block
    289 
    290         add     rax, byte SIZEOF_XMMWORD-1
    291         and     rax, byte -SIZEOF_XMMWORD       ; colctr rounded up to whole blocks
    292         cmp     rax, byte SIZEOF_XMMWORD
    293         ja      short .columnloop               ; more than one block to do
    294 
    295 .columnloop_last:
    296         ; -- process the last column block
    297 
    298         pcmpeqb xmm1,xmm1
    299         pslldq  xmm1,(SIZEOF_XMMWORD-2) ; mask that keeps only word 7
    300         movdqa  xmm2,xmm1
    301 
    302         pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]    ; Int0[15] acts as Int0[16]
    303         pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]    ; Int1[15] acts as Int1[16]
    304 
    305         movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
    306         movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
    307 
    308         jmp     near .upsample
    309 
    310 .columnloop:
    311         ; -- process the next column block
    312 
    313         movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
    314         movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
    315         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
    316 
    317         pxor      xmm3,xmm3             ; xmm3=(all 0's)
    318         movdqa    xmm4,xmm0
    319         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
    320         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
    321         movdqa    xmm5,xmm1
    322         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
    323         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
    324         movdqa    xmm6,xmm2
    325         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
    326         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
    327 
    328         pmullw  xmm0,[rel PW_THREE]
    329         pmullw  xmm4,[rel PW_THREE]
    330 
    331         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
    332         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
    333         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
    334         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
    335 
    336         movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
    337         movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
    338         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    339         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
    340 
    341         pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
    342         pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
    343 
    344         movdqa  XMMWORD [wk(2)], xmm1   ; next block's Int0[0] = current block's Int0[16]
    345         movdqa  XMMWORD [wk(3)], xmm2   ; next block's Int1[0] = current block's Int1[16]
    346 
    347 .upsample:
    348         ; -- process the upper row
    349 
    350         movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    351         movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    352 
    353         movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
    354         movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
    355         psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
    356         pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
    357         movdqa  xmm5,xmm7
    358         movdqa  xmm6,xmm3
    359         psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
    360         pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
    361 
    362         por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
    363         por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
    364 
    365         movdqa  xmm1,xmm7
    366         movdqa  xmm2,xmm3
    367         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
    368         psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
    369         movdqa  xmm4,xmm3
    370         psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
    371 
    372         por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    373         por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
    374 
    375         movdqa  XMMWORD [wk(0)], xmm4   ; Int0[15] becomes Int0[-1] of the next block
    376 
    377         pmullw  xmm7,[rel PW_THREE]     ; 3*Int0[i], low half
    378         pmullw  xmm3,[rel PW_THREE]     ; 3*Int0[i], high half
    379         paddw   xmm1,[rel PW_EIGHT]     ; Int0[i-1] + 8
    380         paddw   xmm5,[rel PW_EIGHT]
    381         paddw   xmm0,[rel PW_SEVEN]     ; Int0[i+1] + 7
    382         paddw   xmm2,[rel PW_SEVEN]
    383 
    384         paddw   xmm1,xmm7
    385         paddw   xmm5,xmm3
    386         psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
    387         psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
    388         paddw   xmm0,xmm7
    389         paddw   xmm2,xmm3
    390         psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
    391         psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
    392 
    393         psllw   xmm0,BYTE_BIT           ; odd outputs go to the high byte of each word
    394         psllw   xmm2,BYTE_BIT
    395         por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
    396         por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
    397 
    398         movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; final samples overwrite the saved intermediates
    399         movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
    400 
    401         ; -- process the lower row
    402 
    403         movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
    404         movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
    405 
    406         movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
    407         movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
    408         psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
    409         pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
    410         movdqa  xmm0,xmm6
    411         movdqa  xmm2,xmm4
    412         psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
    413         pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
    414 
    415         por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
    416         por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
    417 
    418         movdqa  xmm1,xmm6
    419         movdqa  xmm5,xmm4
    420         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
    421         psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
    422         movdqa  xmm3,xmm4
    423         psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
    424 
    425         por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
    426         por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
    427 
    428         movdqa  XMMWORD [wk(1)], xmm3   ; Int1[15] becomes Int1[-1] of the next block
    429 
    430         pmullw  xmm6,[rel PW_THREE]     ; 3*Int1[i], low half
    431         pmullw  xmm4,[rel PW_THREE]     ; 3*Int1[i], high half
    432         paddw   xmm1,[rel PW_EIGHT]     ; Int1[i-1] + 8
    433         paddw   xmm0,[rel PW_EIGHT]
    434         paddw   xmm7,[rel PW_SEVEN]     ; Int1[i+1] + 7
    435         paddw   xmm5,[rel PW_SEVEN]
    436 
    437         paddw   xmm1,xmm6
    438         paddw   xmm0,xmm4
    439         psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
    440         psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
    441         paddw   xmm7,xmm6
    442         paddw   xmm5,xmm4
    443         psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
    444         psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
    445 
    446         psllw   xmm7,BYTE_BIT
    447         psllw   xmm5,BYTE_BIT
    448         por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
    449         por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
    450 
    451         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
    452         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
    453 
    454         sub     rax, byte SIZEOF_XMMWORD
    455         add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
    456         add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
    457         add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
    458         add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
    459         add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
    460         cmp     rax, byte SIZEOF_XMMWORD
    461         ja      near .columnloop                ; at least two blocks remain
    462         test    rax,rax
    463         jnz     near .columnloop_last           ; exactly one block remains
    464 
    465         pop     rsi
    466         pop     rdi
    467         pop     rcx
    468         pop     rax
    469 
    470         add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
    471         add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
    472         sub     rcx, byte 2                     ; rowctr
    473         jg      near .rowloop
    474 
    475 .return:
    476         pop     rbx
    477         uncollect_args
    478         mov     rsp,rbp         ; rsp <- aligned rbp
    479         pop     rsp             ; rsp <- unaligned rsp saved in the prologue
    480         pop     rbp
    481         ret
    482 
    483 ; --------------------------------------------------------------------------
    484 ;
    485 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
    486 ; It's still a box filter.
    487 ;
    488 ; GLOBAL(void)
    489 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
    490 ;                           JDIMENSION output_width,
    491 ;                           JSAMPARRAY input_data,
    492 ;                           JSAMPARRAY * output_data_ptr);
    493 ;
    494 ; Every input sample is written twice; columns are processed in steps of 2*SIZEOF_XMMWORD outputs.
    495 ; r10 = int max_v_samp_factor
    496 ; r11 = JDIMENSION output_width
    497 ; r12 = JSAMPARRAY input_data
    498 ; r13 = JSAMPARRAY * output_data_ptr
    499 ; (register assignment as left by collect_args, a macro defined outside this file)
    500         align   16
    501         global  EXTN(jsimd_h2v1_upsample_sse2)
    502 
    503 EXTN(jsimd_h2v1_upsample_sse2):
    504         push    rbp
    505         mov     rax,rsp         ; rax = frame base (presumably read by collect_args -- confirm)
    506         mov     rbp,rsp
    507         collect_args
    508 
    509         mov     edx, r11d       ; output_width (32-bit move: upper half of r11 is ignored)
    510         add     rdx, byte (2*SIZEOF_XMMWORD)-1
    511         and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to whole output steps; sets ZF if zero
    512         jz      near .return
    513 
    514         movsxd  rcx, r10d       ; rowctr: int argument, only the low 32 bits of r10 are defined
    515         test    rcx,rcx
    516         jz      short .return
    517 
    518         mov     rsi, r12 ; input_data
    519         mov     rdi, r13
    520         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
    521 .rowloop:
    522         push    rdi
    523         push    rsi
    524 
    525         mov     rsi, JSAMPROW [rsi]             ; inptr
    526         mov     rdi, JSAMPROW [rdi]             ; outptr
    527         mov     rax,rdx                         ; colctr
    528 .columnloop:
    529         ; loop body is unrolled twice: two input xmmwords per full pass
    530         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    531 
    532         movdqa    xmm1,xmm0
    533         punpcklbw xmm0,xmm0             ; low 8 samples, each duplicated
    534         punpckhbw xmm1,xmm1             ; high 8 samples, each duplicated
    535 
    536         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    537         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    538 
    539         sub     rax, byte 2*SIZEOF_XMMWORD
    540         jz      short .nextrow
    541 
    542         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    543 
    544         movdqa    xmm3,xmm2
    545         punpcklbw xmm2,xmm2
    546         punpckhbw xmm3,xmm3
    547 
    548         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    549         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    550 
    551         sub     rax, byte 2*SIZEOF_XMMWORD
    552         jz      short .nextrow
    553 
    554         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    555         add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
    556         jmp     short .columnloop
    557 
    558 .nextrow:
    559         pop     rsi
    560         pop     rdi
    561 
    562         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
    563         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
    564         dec     rcx                             ; rowctr
    565         jg      short .rowloop
    566 
    567 .return:
    568         uncollect_args
    569         pop     rbp
    570         ret
    571 
    572 ; --------------------------------------------------------------------------
    573 ;
    574 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
    575 ; It's still a box filter.
    576 ;
    577 ; GLOBAL(void)
    578 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
    579 ;                           JDIMENSION output_width,
    580 ;                           JSAMPARRAY input_data,
    581 ;                           JSAMPARRAY * output_data_ptr);
    582 ;
    583 ; Every input sample is written twice into each of two consecutive output rows.
    584 ; r10 = int max_v_samp_factor
    585 ; r11 = JDIMENSION output_width
    586 ; r12 = JSAMPARRAY input_data
    587 ; r13 = JSAMPARRAY * output_data_ptr
    588 ; (register assignment as left by collect_args, a macro defined outside this file)
    589         align   16
    590         global  EXTN(jsimd_h2v2_upsample_sse2)
    591 
    592 EXTN(jsimd_h2v2_upsample_sse2):
    593         push    rbp
    594         mov     rax,rsp         ; rax = frame base (presumably read by collect_args -- confirm)
    595         mov     rbp,rsp
    596         collect_args
    597         push    rbx             ; rbx is used for outptr0 below
    598 
    599         mov     edx, r11d       ; output_width (32-bit move: upper half of r11 is ignored)
    600         add     rdx, byte (2*SIZEOF_XMMWORD)-1
    601         and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to whole output steps; sets ZF if zero
    602         jz      near .return
    603 
    604         movsxd  rcx, r10d       ; rowctr: int argument, only the low 32 bits of r10 are defined
    605         test    rcx,rcx
    606         jz      near .return
    607 
    608         mov     rsi, r12        ; input_data
    609         mov     rdi, r13
    610         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
    611 .rowloop:
    612         push    rdi
    613         push    rsi
    614 
    615         mov     rsi, JSAMPROW [rsi]                     ; inptr
    616         mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
    617         mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
    618         mov     rax,rdx                                 ; colctr
    619 .columnloop:
    620         ; loop body is unrolled twice: two input xmmwords per full pass
    621         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    622 
    623         movdqa    xmm1,xmm0
    624         punpcklbw xmm0,xmm0             ; low 8 samples, each duplicated
    625         punpckhbw xmm1,xmm1             ; high 8 samples, each duplicated
    626 
    627         movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
    628         movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
    629         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; same data goes to the second output row
    630         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
    631 
    632         sub     rax, byte 2*SIZEOF_XMMWORD
    633         jz      short .nextrow
    634 
    635         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    636 
    637         movdqa    xmm3,xmm2
    638         punpcklbw xmm2,xmm2
    639         punpckhbw xmm3,xmm3
    640 
    641         movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
    642         movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
    643         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
    644         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
    645 
    646         sub     rax, byte 2*SIZEOF_XMMWORD
    647         jz      short .nextrow
    648 
    649         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    650         add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
    651         add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
    652         jmp     short .columnloop
    653 
    654 .nextrow:
    655         pop     rsi
    656         pop     rdi
    657 
    658         add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
    659         add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
    660         sub     rcx, byte 2                     ; rowctr
    661         jg      near .rowloop
    662 
    663 .return:
    664         pop     rbx
    665         uncollect_args
    666         pop     rbp
    667         ret
    668 
    669 ; For some reason, the OS X linker does not honor the request to align the
    670 ; segment unless we do this.
    671         align   16
    672