Home | History | Annotate | Download | only in simd
      1 ;
      2 ; jcsample.asm - downsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22         SECTION SEG_TEXT
     23         BITS    64
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
     31 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
     32 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
     33 ; Output sample i = (in[2i] + in[2i+1] + bias) >> 1, with bias 0,1,0,1,...
     34 ; Input rows are first padded on the right by repeating their last sample.
     35 ; r10 = JDIMENSION image_width
     36 ; r11 = int max_v_samp_factor
     37 ; r12 = JDIMENSION v_samp_factor
     38 ; r13 = JDIMENSION width_blocks
     39 ; r14 = JSAMPARRAY input_data
     40 ; r15 = JSAMPARRAY output_data
     41 ; Row data is accessed with movdqa, which requires 16-byte alignment.
     42         align   16
     43         global  EXTN(jsimd_h2v1_downsample_sse2)
     44 
     45 EXTN(jsimd_h2v1_downsample_sse2):
     46         push    rbp
     47         mov     rax,rsp         ; NOTE(review): presumably read by collect_args
     48         mov     rbp,rsp
     49         collect_args
     50 
     51         mov     ecx, r13d               ; width_blocks, zero-extended into rcx
     52         shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
     53         jz      near .return
     54 
     55         mov     edx, r10d               ; rdx = image_width, zero-extended
     56 
     57         ; -- expand_right_edge
     58 
     59         push    rcx                             ; save output_cols
     60         shl     rcx,1                           ; output_cols * 2
     61         sub     rcx,rdx                         ; rcx = bytes to pad per row
     62         jle     short .expand_end               ; rows are already wide enough
     63 
     64         movsxd  rax, r11d       ; max_v_samp_factor is a 32-bit int: sign-extend
     65         test    rax,rax         ; it instead of trusting the upper half of r11
     66         jle     short .expand_end
     67 
     68         cld                     ; rep stosb below must store upward
     69         mov     rsi, r14        ; input_data
     70 .expandloop:
     71         push    rax             ; al is reused for the fill byte
     72         push    rcx             ; rep stosb consumes rcx
     73 
     74         mov     rdi, JSAMPROW [rsi]     ; rdi = start of this input row
     75         add     rdi,rdx                 ; rdi = first byte past image_width
     76         mov     al, JSAMPLE [rdi-1]     ; al = last real sample of the row
     77 
     78         rep stosb                       ; write al rcx times starting at rdi
     79 
     80         pop     rcx
     81         pop     rax
     82 
     83         add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
     84         dec     rax
     85         jg      short .expandloop
     86 
     87 .expand_end:
     88         pop     rcx                             ; output_cols
     89 
     90         ; -- h2v1_downsample
     91 
     92         mov     eax, r12d        ; rowctr
     93         test    eax,eax
     94         jle     near .return
     95 
     96         mov     rdx, 0x00010000         ; bias pattern
     97         movd    xmm7,edx
     98         pcmpeqw xmm6,xmm6               ; xmm6 = all ones
     99         pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    100         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    101 
    102         mov     rsi, r14        ; input_data
    103         mov     rdi, r15        ; output_data
    104 .rowloop:
    105         push    rcx             ; keep output_cols for the next row
    106         push    rdi
    107         push    rsi
    108 
    109         mov     rsi, JSAMPROW [rsi]             ; inptr
    110         mov     rdi, JSAMPROW [rdi]             ; outptr
    111 
    112         cmp     rcx, byte SIZEOF_XMMWORD
    113         jae     short .columnloop
    114 
    115 .columnloop_r8:         ; tail: less than one XMMWORD of output remains
    116         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    117         pxor    xmm1,xmm1               ; no second input vector: use zeros
    118         mov     rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx = 0
    119         jmp     short .downsample
    120 
    121 .columnloop:            ; main loop: one XMMWORD of output per pass
    122         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    123         movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    124 
    125 .downsample:
    126         movdqa  xmm2,xmm0
    127         movdqa  xmm3,xmm1
    128 
    129         pand    xmm0,xmm6               ; even-indexed samples as words
    130         psrlw   xmm2,BYTE_BIT           ; odd-indexed samples as words
    131         pand    xmm1,xmm6
    132         psrlw   xmm3,BYTE_BIT
    133 
    134         paddw   xmm0,xmm2               ; even + odd
    135         paddw   xmm1,xmm3
    136         paddw   xmm0,xmm7               ; + bias
    137         paddw   xmm1,xmm7
    138         psrlw   xmm0,1                  ; / 2
    139         psrlw   xmm1,1
    140 
    141         packuswb xmm0,xmm1              ; words -> bytes (xmm0 low, xmm1 high)
    142 
    143         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; always a full XMMWORD
    144 
    145         sub     rcx, byte SIZEOF_XMMWORD        ; outcol
    146         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
    147         add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    148         cmp     rcx, byte SIZEOF_XMMWORD
    149         jae     short .columnloop
    150         test    rcx,rcx
    151         jnz     short .columnloop_r8
    152 
    153         pop     rsi
    154         pop     rdi
    155         pop     rcx
    156 
    157         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
    158         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
    159         dec     rax                             ; rowctr
    160         jg      near .rowloop
    161 
    162 .return:
    163         uncollect_args
    164         pop     rbp
    165         ret
    166 
    167 ; --------------------------------------------------------------------------
    168 ;
    169 ; Downsample pixel values of a single component.
    170 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    171 ; without smoothing.
    172 ;
    173 ; GLOBAL(void)
    174 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    175 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    176 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    177 ; Output sample = (sum of a 2x2 input group + bias) >> 2, bias 1,2,1,2,...
    178 ; Input rows are first padded on the right by repeating their last sample.
    179 ; r10 = JDIMENSION image_width
    180 ; r11 = int max_v_samp_factor
    181 ; r12 = JDIMENSION v_samp_factor
    182 ; r13 = JDIMENSION width_blocks
    183 ; r14 = JSAMPARRAY input_data
    184 ; r15 = JSAMPARRAY output_data
    185 ; Row data is accessed with movdqa, which requires 16-byte alignment.
    186         align   16
    187         global  EXTN(jsimd_h2v2_downsample_sse2)
    188 
    189 EXTN(jsimd_h2v2_downsample_sse2):
    190         push    rbp
    191         mov     rax,rsp         ; NOTE(review): presumably read by collect_args
    192         mov     rbp,rsp
    193         collect_args
    194 
    195         mov     ecx, r13d               ; width_blocks, zero-extended into rcx
    196         shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
    197         jz      near .return
    198 
    199         mov     edx, r10d               ; rdx = image_width, zero-extended
    200 
    201         ; -- expand_right_edge
    202 
    203         push    rcx                             ; save output_cols
    204         shl     rcx,1                           ; output_cols * 2
    205         sub     rcx,rdx                         ; rcx = bytes to pad per row
    206         jle     short .expand_end               ; rows are already wide enough
    207 
    208         movsxd  rax, r11d       ; max_v_samp_factor is a 32-bit int: sign-extend
    209         test    rax,rax         ; it instead of trusting the upper half of r11
    210         jle     short .expand_end
    211 
    212         cld                     ; rep stosb below must store upward
    213         mov     rsi, r14        ; input_data
    214 .expandloop:
    215         push    rax             ; al is reused for the fill byte
    216         push    rcx             ; rep stosb consumes rcx
    217 
    218         mov     rdi, JSAMPROW [rsi]     ; rdi = start of this input row
    219         add     rdi,rdx                 ; rdi = first byte past image_width
    220         mov     al, JSAMPLE [rdi-1]     ; al = last real sample of the row
    221 
    222         rep stosb                       ; write al rcx times starting at rdi
    223 
    224         pop     rcx
    225         pop     rax
    226 
    227         add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
    228         dec     rax
    229         jg      short .expandloop
    230 
    231 .expand_end:
    232         pop     rcx                             ; output_cols
    233 
    234         ; -- h2v2_downsample
    235 
    236         mov     eax, r12d        ; rowctr
    237         test    rax,rax
    238         jle     near .return
    239 
    240         mov     rdx, 0x00020001         ; bias pattern
    241         movd    xmm7,edx
    242         pcmpeqw xmm6,xmm6               ; xmm6 = all ones
    243         pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    244         psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    245 
    246         mov     rsi, r14        ; input_data
    247         mov     rdi, r15        ; output_data
    248 .rowloop:
    249         push    rcx             ; keep output_cols for the next row
    250         push    rdi
    251         push    rsi
    252 
    253         mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
    254         mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
    255         mov     rdi, JSAMPROW [rdi]                     ; outptr
    256 
    257         cmp     rcx, byte SIZEOF_XMMWORD
    258         jae     short .columnloop
    259 
    260 .columnloop_r8:         ; tail: less than one XMMWORD of output remains
    261         movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    262         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    263         pxor    xmm2,xmm2               ; no second input vectors: use zeros
    264         pxor    xmm3,xmm3
    265         mov     rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx = 0
    266         jmp     short .downsample
    267 
    268 .columnloop:            ; main loop: one XMMWORD of output per pass
    269         movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    270         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    271         movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    272         movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    273 
    274 .downsample:
    275         movdqa  xmm4,xmm0
    276         movdqa  xmm5,xmm1
    277         pand    xmm0,xmm6               ; even-indexed samples as words
    278         psrlw   xmm4,BYTE_BIT           ; odd-indexed samples as words
    279         pand    xmm1,xmm6
    280         psrlw   xmm5,BYTE_BIT
    281         paddw   xmm0,xmm4               ; row 0 pair sums, first vector
    282         paddw   xmm1,xmm5               ; row 1 pair sums, first vector
    283 
    284         movdqa  xmm4,xmm2
    285         movdqa  xmm5,xmm3
    286         pand    xmm2,xmm6
    287         psrlw   xmm4,BYTE_BIT
    288         pand    xmm3,xmm6
    289         psrlw   xmm5,BYTE_BIT
    290         paddw   xmm2,xmm4               ; row 0 pair sums, second vector
    291         paddw   xmm3,xmm5               ; row 1 pair sums, second vector
    292 
    293         paddw   xmm0,xmm1               ; row 0 + row 1
    294         paddw   xmm2,xmm3
    295         paddw   xmm0,xmm7               ; + bias
    296         paddw   xmm2,xmm7
    297         psrlw   xmm0,2                  ; / 4
    298         psrlw   xmm2,2
    299 
    300         packuswb xmm0,xmm2              ; words -> bytes (xmm0 low, xmm2 high)
    301 
    302         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0    ; always a full XMMWORD
    303 
    304         sub     rcx, byte SIZEOF_XMMWORD        ; outcol
    305         add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
    306         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
    307         add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
    308         cmp     rcx, byte SIZEOF_XMMWORD
    309         jae     near .columnloop
    310         test    rcx,rcx
    311         jnz     near .columnloop_r8
    312 
    313         pop     rsi
    314         pop     rdi
    315         pop     rcx
    316 
    317         add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
    318         add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
    319         dec     rax                             ; rowctr
    320         jg      near .rowloop
    321 
    322 .return:
    323         uncollect_args
    324         pop     rbp
    325         ret
    326 
    327 ; For some reason, the OS X linker does not honor the request to align the
    328 ; segment unless we do this.
    329         align   16
    330