Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsample.asm - downsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
      5 ; Copyright 2009 D. R. Commander
      6 ;
      7 ; Based on
      8 ; x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler); it
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jsimdext.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23         SECTION SEG_TEXT
     24         BITS    64
     25 ;
     26 ; Downsample pixel values of a single component.
     27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     28 ; without smoothing.
     29 ;
     30 ; GLOBAL(void)
     31 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
     32 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     33 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
     34 ;
     35 ; out[i] = (in[2*i] + in[2*i+1] + bias) >> 1, with bias = 0, 1, 0, 1, ...
     36 ; r10 = JDIMENSION image_width          (only the low 32 bits are read)
     37 ; r11 = int max_v_samp_factor           (only the low 32 bits are read)
     38 ; r12 = JDIMENSION v_samp_factor        (only the low 32 bits are read)
     39 ; r13 = JDIMENSION width_blocks         (only the low 32 bits are read)
     40 ; r14 = JSAMPARRAY input_data
     41 ; r15 = JSAMPARRAY output_data
     42 ; Rows are accessed with movdqa: each row pointer must be 16-byte aligned.
     43         align   16
     44         global  EXTN(jsimd_h2v1_downsample_sse2)
     45 
     46 EXTN(jsimd_h2v1_downsample_sse2):
     47         push    rbp
     48         mov     rax,rsp                 ; presumably read by collect_args
     49         mov     rbp,rsp
     50         collect_args
     51 
     52         mov     ecx, r13d               ; 32-bit move zero-extends into rcx
     53         shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
     54         jz      near .return            ; zero-width output: nothing to do
     55 
     56         mov     edx, r10d               ; rdx = image_width (zero-extended)
     57 
     58         ; -- expand_right_edge
     59         ; Pad each input row to output_cols*2 samples using its last sample.
     60         push    rcx
     61         shl     rcx,1                           ; output_cols * 2
     62         sub     rcx,rdx                         ; rcx = samples to append
     63         jle     short .expand_end               ; rows are already wide enough
     64 
     65         mov     eax, r11d                       ; int arg: low 32 bits only
     66         test    eax,eax
     67         jle     short .expand_end               ; zero or negative row count
     68 
     69         cld                                     ; rep stosb must store forward
     70         mov     rsi, r14        ; input_data
     71 .expandloop:
     72         push    rax                     ; row counter; al is overwritten below
     73         push    rcx                     ; pad count; rep stosb consumes rcx
     74 
     75         mov     rdi, JSAMPROW [rsi]
     76         add     rdi,rdx                 ; rdi = first sample past image_width
     77         mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row
     78 
     79         rep stosb                       ; replicate it rcx times
     80 
     81         pop     rcx
     82         pop     rax
     83 
     84         add     rsi, byte SIZEOF_JSAMPROW
     85         dec     rax
     86         jg      short .expandloop
     87 
     88 .expand_end:
     89         pop     rcx                             ; output_cols
     90 
     91         ; -- h2v1_downsample
     92 
     93         mov     eax, r12d               ; rowctr
     94         test    eax,eax
     95         jle     near .return
     96 
     97         mov     edx, 0x00010000         ; bias pattern (only edx is consumed)
     98         movd    xmm7,edx
     99         pcmpeqw xmm6,xmm6
    100         pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    101         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    102 
    103         mov     rsi, r14        ; input_data
    104         mov     rdi, r15        ; output_data
    105 .rowloop:
    106         push    rcx
    107         push    rdi
    108         push    rsi
    109 
    110         mov     rsi, JSAMPROW [rsi]             ; inptr
    111         mov     rdi, JSAMPROW [rdi]             ; outptr
    112 
    113         cmp     rcx, byte SIZEOF_XMMWORD
    114         jae     short .columnloop
    115         ; Fewer than SIZEOF_XMMWORD columns left: one input vector, rest zero.
    116 .columnloop_r8:
    117         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    118         pxor    xmm1,xmm1
    119         mov     rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx = 0
    120         jmp     short .downsample
    121 
    122 .columnloop:
    123         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    124         movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    125 
    126 .downsample:
    127         movdqa  xmm2,xmm0
    128         movdqa  xmm3,xmm1
    129 
    130         pand    xmm0,xmm6               ; even-indexed samples, as words
    131         psrlw   xmm2,BYTE_BIT           ; odd-indexed samples, as words
    132         pand    xmm1,xmm6
    133         psrlw   xmm3,BYTE_BIT
    134 
    135         paddw   xmm0,xmm2               ; even + odd
    136         paddw   xmm1,xmm3
    137         paddw   xmm0,xmm7               ; + alternating bias
    138         paddw   xmm1,xmm7
    139         psrlw   xmm0,1                  ; / 2
    140         psrlw   xmm1,1
    141 
    142         packuswb xmm0,xmm1              ; 16 words -> 16 output samples
    143 
    144         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; always a full store
    145 
    146         sub     rcx, byte SIZEOF_XMMWORD        ; outcol
    147         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    148         add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    149         cmp     rcx, byte SIZEOF_XMMWORD
    150         jae     short .columnloop
    151         test    rcx,rcx
    152         jnz     short .columnloop_r8
    153 
    154         pop     rsi
    155         pop     rdi
    156         pop     rcx
    157 
    158         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
    159         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
    160         dec     rax                             ; rowctr
    161         jg      near .rowloop
    162 
    163 .return:
    164         uncollect_args
    165         pop     rbp
    166         ret
    167 
    168 ; --------------------------------------------------------------------------
    169 ;
    170 ; Downsample pixel values of a single component.
    171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    172 ; without smoothing.
    173 ;
    174 ; GLOBAL(void)
    175 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    176 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    177 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    178 ;
    179 ; out[i] = (sum of the 2x2 input samples + bias) >> 2, bias = 1, 2, 1, 2, ...
    180 ; r10 = JDIMENSION image_width          (only the low 32 bits are read)
    181 ; r11 = int max_v_samp_factor           (only the low 32 bits are read)
    182 ; r12 = JDIMENSION v_samp_factor        (only the low 32 bits are read)
    183 ; r13 = JDIMENSION width_blocks         (only the low 32 bits are read)
    184 ; r14 = JSAMPARRAY input_data
    185 ; r15 = JSAMPARRAY output_data
    186 ; Rows are accessed with movdqa: each row pointer must be 16-byte aligned.
    187         align   16
    188         global  EXTN(jsimd_h2v2_downsample_sse2)
    189 
    190 EXTN(jsimd_h2v2_downsample_sse2):
    191         push    rbp
    192         mov     rax,rsp                 ; presumably read by collect_args
    193         mov     rbp,rsp
    194         collect_args
    195 
    196         mov     ecx, r13d               ; 32-bit move zero-extends into rcx
    197         shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
    198         jz      near .return            ; zero-width output: nothing to do
    199 
    200         mov     edx, r10d               ; rdx = image_width (zero-extended)
    201 
    202         ; -- expand_right_edge
    203         ; Pad each input row to output_cols*2 samples using its last sample.
    204         push    rcx
    205         shl     rcx,1                           ; output_cols * 2
    206         sub     rcx,rdx                         ; rcx = samples to append
    207         jle     short .expand_end               ; rows are already wide enough
    208 
    209         mov     eax, r11d                       ; int arg: low 32 bits only
    210         test    eax,eax
    211         jle     short .expand_end               ; zero or negative row count
    212 
    213         cld                                     ; rep stosb must store forward
    214         mov     rsi, r14        ; input_data
    215 .expandloop:
    216         push    rax                     ; row counter; al is overwritten below
    217         push    rcx                     ; pad count; rep stosb consumes rcx
    218 
    219         mov     rdi, JSAMPROW [rsi]
    220         add     rdi,rdx                 ; rdi = first sample past image_width
    221         mov     al, JSAMPLE [rdi-1]     ; al = last real sample of this row
    222 
    223         rep stosb                       ; replicate it rcx times
    224 
    225         pop     rcx
    226         pop     rax
    227 
    228         add     rsi, byte SIZEOF_JSAMPROW
    229         dec     rax
    230         jg      short .expandloop
    231 
    232 .expand_end:
    233         pop     rcx                             ; output_cols
    234 
    235         ; -- h2v2_downsample
    236 
    237         mov     eax, r12d               ; rowctr (output rows)
    238         test    eax,eax                 ; same 32-bit check as the h2v1 routine
    239         jle     near .return
    240 
    241         mov     edx, 0x00020001         ; bias pattern (only edx is consumed)
    242         movd    xmm7,edx
    243         pcmpeqw xmm6,xmm6
    244         pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    245         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    246 
    247         mov     rsi, r14        ; input_data
    248         mov     rdi, r15        ; output_data
    249 .rowloop:
    250         push    rcx
    251         push    rdi
    252         push    rsi
    253 
    254         mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
    255         mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
    256         mov     rdi, JSAMPROW [rdi]                     ; outptr
    257 
    258         cmp     rcx, byte SIZEOF_XMMWORD
    259         jae     short .columnloop
    260         ; Fewer than SIZEOF_XMMWORD columns left: one vector per row, rest zero.
    261 .columnloop_r8:
    262         movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    263         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    264         pxor    xmm2,xmm2
    265         pxor    xmm3,xmm3
    266         mov     rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx = 0
    267         jmp     short .downsample
    268 
    269 .columnloop:
    270         movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    271         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    272         movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    273         movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    274 
    275 .downsample:
    276         movdqa  xmm4,xmm0
    277         movdqa  xmm5,xmm1
    278         pand    xmm0,xmm6               ; even-indexed samples, as words
    279         psrlw   xmm4,BYTE_BIT           ; odd-indexed samples, as words
    280         pand    xmm1,xmm6
    281         psrlw   xmm5,BYTE_BIT
    282         paddw   xmm0,xmm4               ; row 0: even + odd (first vector)
    283         paddw   xmm1,xmm5               ; row 1: even + odd (first vector)
    284 
    285         movdqa  xmm4,xmm2
    286         movdqa  xmm5,xmm3
    287         pand    xmm2,xmm6
    288         psrlw   xmm4,BYTE_BIT
    289         pand    xmm3,xmm6
    290         psrlw   xmm5,BYTE_BIT
    291         paddw   xmm2,xmm4               ; row 0: even + odd (second vector)
    292         paddw   xmm3,xmm5               ; row 1: even + odd (second vector)
    293 
    294         paddw   xmm0,xmm1               ; row 0 + row 1
    295         paddw   xmm2,xmm3
    296         paddw   xmm0,xmm7               ; + alternating bias
    297         paddw   xmm2,xmm7
    298         psrlw   xmm0,2                  ; / 4
    299         psrlw   xmm2,2
    300 
    301         packuswb xmm0,xmm2              ; 16 words -> 16 output samples
    302 
    303         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; always a full store
    304 
    305         sub     rcx, byte SIZEOF_XMMWORD        ; outcol
    306         add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
    307         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
    308         add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    309         cmp     rcx, byte SIZEOF_XMMWORD
    310         jae     near .columnloop
    311         test    rcx,rcx
    312         jnz     near .columnloop_r8
    313 
    314         pop     rsi
    315         pop     rdi
    316         pop     rcx
    317 
    318         add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
    319         add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
    320         dec     rax                             ; rowctr
    321         jg      near .rowloop
    322 
    323 .return:
    324         uncollect_args
    325         pop     rbp
    326         ret
    327 
    328 ; For some reason, the OS X linker does not honor the request to align the
    329 ; segment unless we do this.
    330         align   16
    331