Home | History | Annotate | Download | only in i386
      1 ;
      2 ; jcsample.asm - downsampling (AVX2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2015, Intel Corporation.
      6 ; Copyright (C) 2016, D. R. Commander.
      7 ;
      8 ; Based on the x86 SIMD extension for IJG JPEG library
      9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     11 ;
     12 ; This file should be assembled with NASM (Netwide Assembler),
     13 ; can *not* be assembled with Microsoft's MASM or any compatible
     14 ; assembler (including Borland's Turbo Assembler).
     15 ; NASM is available from http://nasm.sourceforge.net/ or
     16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     17 ;
     18 ; [TAB8]
     19 
     20 %include "jsimdext.inc"
     21 
     22 ; --------------------------------------------------------------------------
     23     SECTION     SEG_TEXT                ; code goes here; SEG_TEXT is not defined in this file (presumably from jsimdext.inc)
     24     BITS        32                      ; everything below is assembled as 32-bit code
     25 ;
     26 ; Downsample pixel values of a single component.
     27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
     28 ; without smoothing.
     29 ;
     30 ; GLOBAL(void)
     31 ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
     32 ;                            JDIMENSION v_samp_factor,
     33 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
     34 ;                            JSAMPARRAY output_data);
        ;
        ; All arguments are read from the stack through ebp (see the %defines
        ; below).  The routine runs in two phases:
        ;
        ;   1. expand_right_edge: with output_cols = width_in_blocks << 3, each of
        ;      the first max_v_samp_factor input rows is padded in place: the
        ;      sample at column image_width-1 is copied into the following
        ;      (output_cols*2 - image_width) bytes.  Skipped when that count or
        ;      max_v_samp_factor is <= 0.
        ;   2. h2v1_downsample: for each of v_samp_factor rows, every pair of
        ;      horizontally adjacent input bytes is summed, a rounding bias that
        ;      alternates 0, 1, 0, 1, ... is added, and the result is halved to
        ;      give one output byte.
        ;
        ; Returns immediately when output_cols == 0 or v_samp_factor <= 0.
        ;
        ; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx and
        ; ymm0-ymm3, ymm6, ymm7 are overwritten.  vzeroupper is executed on every
        ; return path.
        ;
        ; NOTE(review): the store in .downsample is always one full YMMWORD, also
        ; when fewer columns remain, and the padding in phase 1 writes past
        ; image_width in the input rows.  Both presumably rely on the caller
        ; providing rows that are large enough - confirm against the caller.
     35 ;
     36 
        ; Argument offsets relative to the frame pointer passed as b.  After
        ; "push ebp / mov ebp, esp", [b] is the saved ebp and [b+4] the return
        ; address, so the first argument is at b+8.
     37 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
     38 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
     39 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
     40 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
     41 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
     42 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
     43 
        ; Function entry is placed on a 32-byte boundary.  GLOBAL_FUNCTION and
        ; EXTN are macros that are not defined in this file.
     44     align       32
     45     GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
     46 
     47 EXTN(jsimd_h2v1_downsample_avx2):
     48     push        ebp
     49     mov         ebp, esp                ; ebp = frame pointer used by the argument macros
     50 ;   push        ebx                     ; unused
     51 ;   push        ecx                     ; need not be preserved
     52 ;   push        edx                     ; need not be preserved
     53     push        esi
     54     push        edi
     55 
     56     mov         ecx, JDIMENSION [width_blks(ebp)]
     57     shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
     58     jz          near .return            ; shl left ZF set: output_cols == 0, nothing to do
     59 
     60     mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
     61 
     62     ; -- expand_right_edge
     63 
     64     push        ecx                     ; keep output_cols for the downsample phase
     65     shl         ecx, 1                  ; output_cols * 2
     66     sub         ecx, edx                ; ecx = number of padding bytes per row
     67     jle         short .expand_end       ; rows already wide enough
     68 
     69     mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
     70     test        eax, eax
     71     jle         short .expand_end       ; no rows to pad
     72 
     73     cld                                 ; rep stosb below must advance edi
     74     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
        ; alignx is a macro that is not defined in this file.
     75     alignx      16, 7
     76 .expandloop:
     77     push        eax                     ; al is overwritten below; keep the row counter
     78     push        ecx                     ; rep stosb counts ecx down; keep the pad count
     79 
     80     mov         edi, JSAMPROW [esi]     ; edi = start of the current input row
     81     add         edi, edx                ; edi = first byte past image_width
     82     mov         al, JSAMPLE [edi-1]     ; al = last sample inside the image width
     83 
     84     rep stosb                           ; write al to the next ecx bytes
     85 
     86     pop         ecx
     87     pop         eax
     88 
     89     add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
     90     dec         eax
     91     jg          short .expandloop       ; loop while rows remain
     92 
     93 .expand_end:
     94     pop         ecx                     ; output_cols
     95 
     96     ; -- h2v1_downsample
     97 
     98     mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
     99     test        eax, eax
    100     jle         near .return            ; no rows to downsample
    101 
    102     mov         edx, 0x00010000         ; bias pattern
    103     vmovd       xmm7, edx
    104     vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    105     vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    106     vpcmpeqw    ymm6, ymm6, ymm6        ; ymm6 = all bits set
    107     vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
    108 
    109     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    110     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    111     alignx      16, 7
    112 .rowloop:
    113     push        ecx                     ; save output_cols (ecx is the column counter below)
    114     push        edi                     ; save position in output_data
    115     push        esi                     ; save position in input_data
    116 
    117     mov         esi, JSAMPROW [esi]     ; inptr
    118     mov         edi, JSAMPROW [edi]     ; outptr
    119 
    120     cmp         ecx, byte SIZEOF_YMMWORD
    121     jae         short .columnloop       ; at least one full vector of output columns
    122     alignx      16, 7
    123 
        ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain.  Each
        ; case loads only the input that is needed, zeroes the rest, and sets
        ; ecx to SIZEOF_YMMWORD so the subtraction after the store leaves 0.
    124 .columnloop_r24:
    125     ; ecx can possibly be 8, 16, 24
    126     cmp         ecx, 24
    127     jne         .columnloop_r16
    128     vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    129     vmovdqu     xmm1, XMMWORD [esi+1*SIZEOF_YMMWORD]  ; 128-bit VEX load clears the upper half of ymm1
    130     mov         ecx, SIZEOF_YMMWORD
    131     jmp         short .downsample
    132 
    133 .columnloop_r16:
    134     cmp         ecx, 16
    135     jne         .columnloop_r8
    136     vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    137     vpxor       ymm1, ymm1, ymm1        ; no second input vector
    138     mov         ecx, SIZEOF_YMMWORD
    139     jmp         short .downsample
    140 
    141 .columnloop_r8:                         ; reached for any count other than 24 or 16
    142     vmovdqu     xmm0, XMMWORD[esi+0*SIZEOF_YMMWORD]  ; upper half of ymm0 is cleared by the load
    143     vpxor       ymm1, ymm1, ymm1
    144     mov         ecx, SIZEOF_YMMWORD
    145     jmp         short .downsample
    146     alignx      16, 7
    147 
    148 .columnloop:                            ; full-width path: two input vectors per output vector
    149     vmovdqu     ymm0, YMMWORD [esi+0*SIZEOF_YMMWORD]
    150     vmovdqu     ymm1, YMMWORD [esi+1*SIZEOF_YMMWORD]
    151 
    152 .downsample:
    153     vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2 = high byte of each word (odd-indexed samples)
    154     vpand       ymm0, ymm0, ymm6        ; ymm0 = low byte of each word (even-indexed samples)
    155     vpsrlw      ymm3, ymm1, BYTE_BIT    ; same split for the second input vector
    156     vpand       ymm1, ymm1, ymm6
    157 
    158     vpaddw      ymm0, ymm0, ymm2        ; sum of each horizontal pair
    159     vpaddw      ymm1, ymm1, ymm3
    160     vpaddw      ymm0, ymm0, ymm7        ; add the alternating 0/1 bias
    161     vpaddw      ymm1, ymm1, ymm7
    162     vpsrlw      ymm0, ymm0, 1           ; divide by 2
    163     vpsrlw      ymm1, ymm1, 1
    164 
    165     vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, packed separately in each 128-bit lane
    166     vpermq      ymm0, ymm0, 0xd8        ; reorder qwords 0,2,1,3 to restore column order
    167 
    168     vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; full-width store on every path
    169 
    170     sub         ecx, byte SIZEOF_YMMWORD    ; outcol
    171     add         esi, byte 2*SIZEOF_YMMWORD  ; inptr
    172     add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    173     cmp         ecx, byte SIZEOF_YMMWORD
    174     jae         short .columnloop       ; another full vector remains
    175     test        ecx, ecx
    176     jnz         near .columnloop_r24    ; leftover columns take the tail path
    177 
    178     pop         esi                     ; restore input_data position
    179     pop         edi                     ; restore output_data position
    180     pop         ecx                     ; restore output_cols
    181 
    182     add         esi, byte SIZEOF_JSAMPROW  ; input_data
    183     add         edi, byte SIZEOF_JSAMPROW  ; output_data
    184     dec         eax                        ; rowctr
    185     jg          near .rowloop
    186 
    187 .return:
    188     vzeroupper                          ; clear the upper halves of the ymm registers before returning
    189     pop         edi
    190     pop         esi
    191 ;   pop         edx                     ; need not be preserved
    192 ;   pop         ecx                     ; need not be preserved
    193 ;   pop         ebx                     ; unused
    194     pop         ebp
    195     ret
    196 
    197 ; --------------------------------------------------------------------------
    198 ;
    199 ; Downsample pixel values of a single component.
    200 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
    201 ; without smoothing.
    202 ;
    203 ; GLOBAL(void)
    204 ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
    205 ;                            JDIMENSION v_samp_factor,
    206 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
    207 ;                            JSAMPARRAY output_data);
        ;
        ; All arguments are read from the stack through ebp (see the %defines
        ; below).  The routine runs in two phases:
        ;
        ;   1. expand_right_edge: with output_cols = width_in_blocks << 3, each of
        ;      the first max_v_samp_factor input rows is padded in place: the
        ;      sample at column image_width-1 is copied into the following
        ;      (output_cols*2 - image_width) bytes.  Skipped when that count or
        ;      max_v_samp_factor is <= 0.
        ;   2. h2v2_downsample: for each of v_samp_factor output rows, two
        ;      consecutive input rows are read.  Each 2x2 group of input bytes is
        ;      summed, a rounding bias that alternates 1, 2, 1, 2, ... is added,
        ;      and the result is divided by 4 to give one output byte.
        ;
        ; Returns immediately when output_cols == 0 or v_samp_factor <= 0.
        ;
        ; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx and
        ; ymm0-ymm7 are overwritten.  vzeroupper is executed on every return
        ; path.
        ;
        ; NOTE(review): the store in .downsample is always one full YMMWORD, also
        ; when fewer columns remain, and the padding in phase 1 writes past
        ; image_width in the input rows.  Both presumably rely on the caller
        ; providing rows that are large enough - confirm against the caller.
    208 ;
    209 
        ; Argument offsets relative to the frame pointer passed as b.  After
        ; "push ebp / mov ebp, esp", [b] is the saved ebp and [b+4] the return
        ; address, so the first argument is at b+8.
    210 %define img_width(b)    (b) + 8         ; JDIMENSION image_width
    211 %define max_v_samp(b)   (b) + 12        ; int max_v_samp_factor
    212 %define v_samp(b)       (b) + 16        ; JDIMENSION v_samp_factor
    213 %define width_blks(b)   (b) + 20        ; JDIMENSION width_in_blocks
    214 %define input_data(b)   (b) + 24        ; JSAMPARRAY input_data
    215 %define output_data(b)  (b) + 28        ; JSAMPARRAY output_data
    216 
        ; Function entry is placed on a 32-byte boundary.  GLOBAL_FUNCTION and
        ; EXTN are macros that are not defined in this file.
    217     align       32
    218     GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
    219 
    220 EXTN(jsimd_h2v2_downsample_avx2):
    221     push        ebp
    222     mov         ebp, esp                ; ebp = frame pointer used by the argument macros
    223 ;   push        ebx                     ; unused
    224 ;   push        ecx                     ; need not be preserved
    225 ;   push        edx                     ; need not be preserved
    226     push        esi
    227     push        edi
    228 
    229     mov         ecx, JDIMENSION [width_blks(ebp)]
    230     shl         ecx, 3                  ; imul ecx,DCTSIZE (ecx = output_cols)
    231     jz          near .return            ; shl left ZF set: output_cols == 0, nothing to do
    232 
    233     mov         edx, JDIMENSION [img_width(ebp)]  ; edx = image_width
    234 
    235     ; -- expand_right_edge
    236 
    237     push        ecx                     ; keep output_cols for the downsample phase
    238     shl         ecx, 1                  ; output_cols * 2
    239     sub         ecx, edx                ; ecx = number of padding bytes per row
    240     jle         short .expand_end       ; rows already wide enough
    241 
    242     mov         eax, INT [max_v_samp(ebp)]  ; eax = rows left to pad
    243     test        eax, eax
    244     jle         short .expand_end       ; no rows to pad
    245 
    246     cld                                 ; rep stosb below must advance edi
    247     mov         esi, JSAMPARRAY [input_data(ebp)]  ; input_data
        ; alignx is a macro that is not defined in this file.
    248     alignx      16, 7
    249 .expandloop:
    250     push        eax                     ; al is overwritten below; keep the row counter
    251     push        ecx                     ; rep stosb counts ecx down; keep the pad count
    252 
    253     mov         edi, JSAMPROW [esi]     ; edi = start of the current input row
    254     add         edi, edx                ; edi = first byte past image_width
    255     mov         al, JSAMPLE [edi-1]     ; al = last sample inside the image width
    256 
    257     rep stosb                           ; write al to the next ecx bytes
    258 
    259     pop         ecx
    260     pop         eax
    261 
    262     add         esi, byte SIZEOF_JSAMPROW  ; next row pointer
    263     dec         eax
    264     jg          short .expandloop       ; loop while rows remain
    265 
    266 .expand_end:
    267     pop         ecx                     ; output_cols
    268 
    269     ; -- h2v2_downsample
    270 
    271     mov         eax, JDIMENSION [v_samp(ebp)]  ; rowctr
    272     test        eax, eax
    273     jle         near .return            ; no rows to downsample
    274 
    275     mov         edx, 0x00020001         ; bias pattern
    276     vmovd       xmm7, edx
    277     vpcmpeqw    ymm6, ymm6, ymm6        ; ymm6 = all bits set
    278     vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    279     vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    280     vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}
    281 
    282     mov         esi, JSAMPARRAY [input_data(ebp)]   ; input_data
    283     mov         edi, JSAMPARRAY [output_data(ebp)]  ; output_data
    284     alignx      16, 7
    285 .rowloop:
    286     push        ecx                     ; save output_cols (ecx is the column counter below)
    287     push        edi                     ; save position in output_data
    288     push        esi                     ; save position in input_data
    289 
    290     mov         edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; inptr0
    291     mov         esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; inptr1
    292     mov         edi, JSAMPROW [edi]                    ; outptr
    293 
    294     cmp         ecx, byte SIZEOF_YMMWORD
    295     jae         short .columnloop       ; at least one full vector of output columns
    296     alignx      16, 7
    297 
        ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain.  Each
        ; case loads only the input that is needed from both rows, zeroes the
        ; rest, and sets ecx to SIZEOF_YMMWORD so the subtraction after the store
        ; leaves 0.
    298 .columnloop_r24:
    299     cmp         ecx, 24
    300     jne         .columnloop_r16
    301     vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    302     vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    303     vmovdqu     xmm2, XMMWORD [edx+1*SIZEOF_YMMWORD]  ; 128-bit VEX loads clear the upper halves
    304     vmovdqu     xmm3, XMMWORD [esi+1*SIZEOF_YMMWORD]
    305     mov         ecx, SIZEOF_YMMWORD
    306     jmp         short .downsample
    307 
    308 .columnloop_r16:
    309     cmp         ecx, 16
    310     jne         .columnloop_r8
    311     vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    312     vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    313     vpxor       ymm2, ymm2, ymm2        ; no second input vector for either row
    314     vpxor       ymm3, ymm3, ymm3
    315     mov         ecx, SIZEOF_YMMWORD
    316     jmp         short .downsample
    317 
    318 .columnloop_r8:                         ; reached for any count other than 24 or 16
    319     vmovdqu     xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]  ; upper halves of ymm0/ymm1 are cleared by the loads
    320     vmovdqu     xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
    321     vpxor       ymm2, ymm2, ymm2
    322     vpxor       ymm3, ymm3, ymm3
    323     mov         ecx, SIZEOF_YMMWORD
    324     jmp         short .downsample
    325     alignx      16, 7
    326 
    327 .columnloop:                            ; full-width path: two input vectors from each row
    328     vmovdqu     ymm0, YMMWORD [edx+0*SIZEOF_YMMWORD]
    329     vmovdqu     ymm1, YMMWORD [esi+0*SIZEOF_YMMWORD]
    330     vmovdqu     ymm2, YMMWORD [edx+1*SIZEOF_YMMWORD]
    331     vmovdqu     ymm3, YMMWORD [esi+1*SIZEOF_YMMWORD]
    332 
    333 .downsample:
    334     vpand       ymm4, ymm0, ymm6        ; row 0: low byte of each word (even-indexed samples)
    335     vpsrlw      ymm0, ymm0, BYTE_BIT    ; row 0: high byte of each word (odd-indexed samples)
    336     vpand       ymm5, ymm1, ymm6        ; row 1: even-indexed samples
    337     vpsrlw      ymm1, ymm1, BYTE_BIT    ; row 1: odd-indexed samples
    338     vpaddw      ymm0, ymm0, ymm4        ; row 0 horizontal pair sums
    339     vpaddw      ymm1, ymm1, ymm5        ; row 1 horizontal pair sums
    340 
    341     vpand       ymm4, ymm2, ymm6        ; same steps for the second input vector of each row
    342     vpsrlw      ymm2, ymm2, BYTE_BIT
    343     vpand       ymm5, ymm3, ymm6
    344     vpsrlw      ymm3, ymm3, BYTE_BIT
    345     vpaddw      ymm2, ymm2, ymm4
    346     vpaddw      ymm3, ymm3, ymm5
    347 
    348     vpaddw      ymm0, ymm0, ymm1        ; 2x2 sums: row 0 + row 1
    349     vpaddw      ymm2, ymm2, ymm3
    350     vpaddw      ymm0, ymm0, ymm7        ; add the alternating 1/2 bias
    351     vpaddw      ymm2, ymm2, ymm7
    352     vpsrlw      ymm0, ymm0, 2           ; divide by 4
    353     vpsrlw      ymm2, ymm2, 2
    354 
    355     vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, packed separately in each 128-bit lane
    356     vpermq      ymm0, ymm0, 0xd8        ; reorder qwords 0,2,1,3 to restore column order
    357 
    358     vmovdqu     YMMWORD [edi+0*SIZEOF_YMMWORD], ymm0  ; full-width store on every path
    359 
    360     sub         ecx, byte SIZEOF_YMMWORD    ; outcol
    361     add         edx, byte 2*SIZEOF_YMMWORD  ; inptr0
    362     add         esi, byte 2*SIZEOF_YMMWORD  ; inptr1
    363     add         edi, byte 1*SIZEOF_YMMWORD  ; outptr
    364     cmp         ecx, byte SIZEOF_YMMWORD
    365     jae         near .columnloop        ; another full vector remains
    366     test        ecx, ecx
    367     jnz         near .columnloop_r24    ; leftover columns take the tail path
    368 
    369     pop         esi                     ; restore input_data position
    370     pop         edi                     ; restore output_data position
    371     pop         ecx                     ; restore output_cols
    372 
    373     add         esi, byte 2*SIZEOF_JSAMPROW  ; input_data
    374     add         edi, byte 1*SIZEOF_JSAMPROW  ; output_data
    375     dec         eax                          ; rowctr
    376     jg          near .rowloop
    377 
    378 .return:
    379     vzeroupper                          ; clear the upper halves of the ymm registers before returning
    380     pop         edi
    381     pop         esi
    382 ;   pop         edx                     ; need not be preserved
    383 ;   pop         ecx                     ; need not be preserved
    384 ;   pop         ebx                     ; unused
    385     pop         ebp
    386     ret
    387 
    388 ; For some reason, the OS X linker does not honor the request to align the
    389 ; segment unless we do this.
    390     align       32
    391