Home | History | Annotate | Download | only in x86_64
      1 ;
      2 ; jcsample.asm - downsampling (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, D. R. Commander.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler),
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jsimdext.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22     SECTION     SEG_TEXT
     23     BITS        64
     24 ;
     25 ; Downsample pixel values of a single component.
     26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     27 ; without smoothing.
     28 ;
     29 ; GLOBAL(void)
     30 ; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
     31 ;                            JDIMENSION v_samp_factor,
     32 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     33 ;                            JSAMPARRAY output_data);
     34 ; Pass 1 pads each input row out to 2*output_cols by repeating its last sample;
     35 ; pass 2 stores out[i] = (in[2i] + in[2i+1] + bias) >> 1, bias alternating 0, 1.
     36 ; r10d = JDIMENSION image_width
     37 ; r11d = int max_v_samp_factor (only the low 32 bits are meaningful)
     38 ; r12d = JDIMENSION v_samp_factor
     39 ; r13d = JDIMENSION width_in_blocks
     40 ; r14 = JSAMPARRAY input_data
     41 ; r15 = JSAMPARRAY output_data
     42 
     43     align       32
     44     GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
     45 
     46 EXTN(jsimd_h2v1_downsample_sse2):
     47     push        rbp
     48     mov         rax, rsp                ; NOTE(review): not read below; presumably for collect_args - confirm
     49     mov         rbp, rsp
     50     collect_args 6
     51 
     52     mov         ecx, r13d
     53     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
     54     jz          near .return            ; width_in_blocks == 0: nothing to do
     55 
     56     mov         edx, r10d               ; rdx = image_width (zero-extended)
     57 
     58     ; -- expand_right_edge
     59 
     60     push        rcx                     ; save output_cols
     61     shl         rcx, 1                  ; output_cols * 2
     62     sub         rcx, rdx                ; rcx = padding samples needed per row
     63     jle         short .expand_end       ; rows already wide enough
     64 
     65     movsxd      rax, r11d               ; int arg: sign-extend, r11[63:32] is unspecified
     66     test        rax, rax
     67     jle         short .expand_end
     68 
     69     cld
     70     mov         rsi, r14                ; input_data
     71 .expandloop:
     72     push        rax                     ; al is reused as the fill byte
     73     push        rcx                     ; rep stosb consumes rcx
     74 
     75     mov         rdi, JSAMPROW [rsi]
     76     add         rdi, rdx                ; rdi = first position past image_width
     77     mov         al, JSAMPLE [rdi-1]     ; last real sample of the row
     78 
     79     rep stosb                           ; replicate it rcx times
     80 
     81     pop         rcx
     82     pop         rax
     83 
     84     add         rsi, byte SIZEOF_JSAMPROW
     85     dec         rax
     86     jg          short .expandloop
     87 
     88 .expand_end:
     89     pop         rcx                     ; output_cols
     90 
     91     ; -- h2v1_downsample
     92 
     93     mov         eax, r12d               ; rowctr
     94     test        eax, eax
     95     jle         near .return
     96 
     97     mov         edx, 0x00010000         ; bias pattern (only edx feeds movd)
     98     movd        xmm7, edx
     99     pcmpeqw     xmm6, xmm6
    100     pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    101     psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    102 
    103     mov         rsi, r14                ; input_data
    104     mov         rdi, r15                ; output_data
    105 .rowloop:
    106     push        rcx
    107     push        rdi
    108     push        rsi
    109 
    110     mov         rsi, JSAMPROW [rsi]     ; inptr
    111     mov         rdi, JSAMPROW [rdi]     ; outptr
    112 
    113     cmp         rcx, byte SIZEOF_XMMWORD
    114     jae         short .columnloop
    115 
    116 .columnloop_r8:                         ; fewer than SIZEOF_XMMWORD output columns left
    117     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    118     pxor        xmm1, xmm1              ; upper half of the stored result becomes zero
    119     mov         rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx == 0
    120     jmp         short .downsample
    121 
    122 .columnloop:
    123     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    124     movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    125 
    126 .downsample:
    127     movdqa      xmm2, xmm0
    128     movdqa      xmm3, xmm1
    129 
    130     pand        xmm0, xmm6              ; even-indexed samples as words
    131     psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
    132     pand        xmm1, xmm6
    133     psrlw       xmm3, BYTE_BIT
    134 
    135     paddw       xmm0, xmm2              ; horizontal pair sums
    136     paddw       xmm1, xmm3
    137     paddw       xmm0, xmm7              ; + bias
    138     paddw       xmm1, xmm7
    139     psrlw       xmm0, 1                 ; / 2
    140     psrlw       xmm1, 1
    141 
    142     packuswb    xmm0, xmm1              ; words -> bytes
    143 
    144     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    145 
    146     sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    147     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    148     add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    149     cmp         rcx, byte SIZEOF_XMMWORD
    150     jae         short .columnloop
    151     test        rcx, rcx
    152     jnz         short .columnloop_r8
    153 
    154     pop         rsi
    155     pop         rdi
    156     pop         rcx
    157 
    158     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    159     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    160     dec         rax                        ; rowctr
    161     jg          near .rowloop
    162 
    163 .return:
    164     uncollect_args 6
    165     pop         rbp
    166     ret
    167 
    168 ; --------------------------------------------------------------------------
    169 ;
    170 ; Downsample pixel values of a single component.
    171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    172 ; without smoothing.
    173 ;
    174 ; GLOBAL(void)
    175 ; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
    176 ;                            JDIMENSION v_samp_factor,
    177 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    178 ;                            JSAMPARRAY output_data);
    179 ; Pass 1 pads each input row out to 2*output_cols by repeating its last sample;
    180 ; pass 2 stores (sum of each 2x2 input group + bias) >> 2, bias alternating 1, 2.
    181 ; r10d = JDIMENSION image_width
    182 ; r11d = int max_v_samp_factor (only the low 32 bits are meaningful)
    183 ; r12d = JDIMENSION v_samp_factor
    184 ; r13d = JDIMENSION width_in_blocks
    185 ; r14 = JSAMPARRAY input_data
    186 ; r15 = JSAMPARRAY output_data
    187 
    188     align       32
    189     GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
    190 
    191 EXTN(jsimd_h2v2_downsample_sse2):
    192     push        rbp
    193     mov         rax, rsp                ; NOTE(review): not read below; presumably for collect_args - confirm
    194     mov         rbp, rsp
    195     collect_args 6
    196 
    197     mov         ecx, r13d
    198     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    199     jz          near .return            ; width_in_blocks == 0: nothing to do
    200 
    201     mov         edx, r10d               ; rdx = image_width (zero-extended)
    202 
    203     ; -- expand_right_edge
    204 
    205     push        rcx                     ; save output_cols
    206     shl         rcx, 1                  ; output_cols * 2
    207     sub         rcx, rdx                ; rcx = padding samples needed per row
    208     jle         short .expand_end       ; rows already wide enough
    209 
    210     movsxd      rax, r11d               ; int arg: sign-extend, r11[63:32] is unspecified
    211     test        rax, rax
    212     jle         short .expand_end
    213 
    214     cld
    215     mov         rsi, r14                ; input_data
    216 .expandloop:
    217     push        rax                     ; al is reused as the fill byte
    218     push        rcx                     ; rep stosb consumes rcx
    219 
    220     mov         rdi, JSAMPROW [rsi]
    221     add         rdi, rdx                ; rdi = first position past image_width
    222     mov         al, JSAMPLE [rdi-1]     ; last real sample of the row
    223 
    224     rep stosb                           ; replicate it rcx times
    225 
    226     pop         rcx
    227     pop         rax
    228 
    229     add         rsi, byte SIZEOF_JSAMPROW
    230     dec         rax
    231     jg          short .expandloop
    232 
    233 .expand_end:
    234     pop         rcx                     ; output_cols
    235 
    236     ; -- h2v2_downsample
    237 
    238     mov         eax, r12d               ; rowctr
    239     test        eax, eax                ; 32-bit test, same guard as the h2v1 routine
    240     jle         near .return
    241 
    242     mov         edx, 0x00020001         ; bias pattern (only edx feeds movd)
    243     movd        xmm7, edx
    244     pcmpeqw     xmm6, xmm6
    245     pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    246     psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
    247 
    248     mov         rsi, r14                ; input_data
    249     mov         rdi, r15                ; output_data
    250 .rowloop:
    251     push        rcx
    252     push        rdi
    253     push        rsi
    254 
    255     mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    256     mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    257     mov         rdi, JSAMPROW [rdi]                    ; outptr
    258 
    259     cmp         rcx, byte SIZEOF_XMMWORD
    260     jae         short .columnloop
    261 
    262 .columnloop_r8:                         ; fewer than SIZEOF_XMMWORD output columns left
    263     movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    264     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    265     pxor        xmm2, xmm2              ; upper half of the stored result becomes zero
    266     pxor        xmm3, xmm3
    267     mov         rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx == 0
    268     jmp         short .downsample
    269 
    270 .columnloop:
    271     movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    272     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    273     movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    274     movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    275 
    276 .downsample:
    277     movdqa      xmm4, xmm0
    278     movdqa      xmm5, xmm1
    279     pand        xmm0, xmm6              ; even-indexed samples as words
    280     psrlw       xmm4, BYTE_BIT          ; odd-indexed samples as words
    281     pand        xmm1, xmm6
    282     psrlw       xmm5, BYTE_BIT
    283     paddw       xmm0, xmm4              ; horizontal pair sums, row 0
    284     paddw       xmm1, xmm5              ; horizontal pair sums, row 1
    285 
    286     movdqa      xmm4, xmm2
    287     movdqa      xmm5, xmm3
    288     pand        xmm2, xmm6
    289     psrlw       xmm4, BYTE_BIT
    290     pand        xmm3, xmm6
    291     psrlw       xmm5, BYTE_BIT
    292     paddw       xmm2, xmm4
    293     paddw       xmm3, xmm5
    294 
    295     paddw       xmm0, xmm1              ; add the two rows: 2x2 group sums
    296     paddw       xmm2, xmm3
    297     paddw       xmm0, xmm7              ; + bias
    298     paddw       xmm2, xmm7
    299     psrlw       xmm0, 2                 ; / 4
    300     psrlw       xmm2, 2
    301 
    302     packuswb    xmm0, xmm2              ; words -> bytes
    303 
    304     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
    305 
    306     sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    307     add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    308     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    309     add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    310     cmp         rcx, byte SIZEOF_XMMWORD
    311     jae         near .columnloop
    312     test        rcx, rcx
    313     jnz         near .columnloop_r8
    314 
    315     pop         rsi
    316     pop         rdi
    317     pop         rcx
    318 
    319     add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
    320     add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    321     dec         rax                          ; rowctr
    322     jg          near .rowloop
    323 
    324 .return:
    325     uncollect_args 6
    326     pop         rbp
    327     ret
    328 
    329 ; For some reason, the OS X linker does not honor the request to align the
    330 ; segment unless we do this.
    331     align       32
    332