Home | History | Annotate | Download | only in x86_64
      1 ;
      2 ; jccolext.asm - colorspace conversion (64-bit AVX2)
      3 ;
      4 ; Copyright (C) 2009, 2016, D. R. Commander.
      5 ; Copyright (C) 2015, Intel Corporation.
      6 ;
      7 ; Based on the x86 SIMD extension for IJG JPEG library
      8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
      9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     10 ;
     11 ; This file should be assembled with NASM (Netwide Assembler); it
     12 ; can *not* be assembled with Microsoft's MASM or any compatible
     13 ; assembler (including Borland's Turbo Assembler).
     14 ; NASM is available from http://nasm.sourceforge.net/ or
     15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
     16 ;
     17 ; [TAB8]
     18 
     19 %include "jcolsamp.inc"
     20 
     21 ; --------------------------------------------------------------------------
     22 ;
     23 ; Convert some rows of samples to the output colorspace.
     24 ;
     25 ; GLOBAL(void)
     26 ; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
     27 ;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
     28 ;                            int num_rows);
     29 ;
     30 
     31 ; r10d = JDIMENSION img_width
     32 ; r11 = JSAMPARRAY input_buf
     33 ; r12 = JSAMPIMAGE output_buf
     34 ; r13d = JDIMENSION output_row
     35 ; r14d = int num_rows
     36 
     37 %define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; address of scratch slot i: ymmword wk[WK_NUM], ending at rbp
     38 %define WK_NUM  8                                      ; number of ymmword scratch slots in wk[]
     39 
     40     align       32
     41     GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)
     42 
        ; ------------------------------------------------------------------
        ; Frame:  a 32-byte-aligned rbp is established and WK_NUM ymmword
        ;         scratch slots (wk) are reserved directly below it.  The
        ;         pre-alignment rsp is kept at [rbp] for the epilogue.
        ; Saved:  rbp and rbx are pushed/popped here.  The argument registers
        ;         listed in the register map above are presumably populated
        ;         (and later restored) by collect_args/uncollect_args, which
        ;         are defined in an include file not visible here.
        ; Loops:  rax = rows remaining, rcx = columns remaining in the row.
        ;         Each pass converts SIZEOF_YMMWORD pixels; a row tail shorter
        ;         than that is gathered piecewise by the .column_ld* ladder.
        ; Stores: every pass, including the tail pass, writes one full
        ;         YMMWORD to each of the three output rows.
        ;         NOTE(review): that implies the output rows have room for
        ;         the width rounded up to SIZEOF_YMMWORD -- confirm at caller.
        ; ------------------------------------------------------------------
     43 EXTN(jsimd_rgb_ycc_convert_avx2):
     44     push        rbp
     45     mov         rax, rsp                     ; rax = rsp after the push (address of saved rbp)
     46     sub         rsp, byte 4                  ; step below the saved rbp before aligning
     47     and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
     48     mov         [rsp], rax                   ; keep the pre-alignment rsp for the epilogue
     49     mov         rbp, rsp                     ; rbp = aligned rbp
     50     lea         rsp, [wk(0)]                 ; reserve wk[0..WK_NUM-1] below rbp
     51     collect_args 5
     52     push        rbx
     53 
     54     mov         ecx, r10d                    ; rcx = img_width (zero-extended)
     55     test        rcx, rcx
     56     jz          near .return                 ; zero-width image: nothing to convert
     57 
     58     push        rcx                          ; park img_width while rcx indexes output_row
     59 
     60     mov         rsi, r12                     ; rsi = output_buf
     61     mov         ecx, r13d                    ; rcx = output_row
     62     mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     63     mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
     64     mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     65     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]
     66     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; rbx = &output_buf[1][output_row]
     67     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; rdx = &output_buf[2][output_row]
     68 
     69     pop         rcx                          ; rcx = img_width again
     70 
     71     mov         rsi, r11                     ; rsi = input_buf
     72     mov         eax, r14d                    ; eax = num_rows (signed int); this clears the top half of rax
     73     test        eax, eax                     ; test the 32-bit value so a negative count is seen as negative
     74     jle         near .return                 ; num_rows <= 0: nothing to convert
     75 .rowloop:
     76     push        rdx
     77     push        rbx
     78     push        rdi
     79     push        rsi
     80     push        rcx                     ; col
     81 
     82     mov         rsi, JSAMPROW [rsi]     ; inptr
     83     mov         rdi, JSAMPROW [rdi]     ; outptr0
     84     mov         rbx, JSAMPROW [rbx]     ; outptr1
     85     mov         rdx, JSAMPROW [rdx]     ; outptr2
     86 
     87     cmp         rcx, byte SIZEOF_YMMWORD
     88     jae         near .columnloop        ; at least one full group of pixels is available
     89 
     90 %if RGB_PIXELSIZE == 3  ; ---------------
     91 
     92 .column_ld1:
     93     push        rax                     ; rax/rdx are borrowed as scratch for the
     94     push        rdx                     ; 1- and 2-byte pieces of the tail
     95     lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE (rcx = tail length in bytes)
     96     test        cl, SIZEOF_BYTE
     97     jz          short .column_ld2
     98     sub         rcx, byte SIZEOF_BYTE
     99     movzx       rax, BYTE [rsi+rcx]
    100 .column_ld2:
    101     test        cl, SIZEOF_WORD
    102     jz          short .column_ld4
    103     sub         rcx, byte SIZEOF_WORD
    104     movzx       rdx, WORD [rsi+rcx]
    105     shl         rax, WORD_BIT
    106     or          rax, rdx
    107 .column_ld4:
    108     vmovd       xmmA, eax
    109     pop         rdx
    110     pop         rax
    111     test        cl, SIZEOF_DWORD
    112     jz          short .column_ld8
    113     sub         rcx, byte SIZEOF_DWORD
    114     vmovd       xmmF, XMM_DWORD [rsi+rcx]
    115     vpslldq     xmmA, xmmA, SIZEOF_DWORD
    116     vpor        xmmA, xmmA, xmmF
    117 .column_ld8:
    118     test        cl, SIZEOF_MMWORD
    119     jz          short .column_ld16
    120     sub         rcx, byte SIZEOF_MMWORD
    121     vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    122     vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    123     vpor        xmmA, xmmA, xmmB
    124 .column_ld16:
    125     test        cl, SIZEOF_XMMWORD
    126     jz          short .column_ld32
    127     sub         rcx, byte SIZEOF_XMMWORD
    128     vmovdqu     xmmB, XMMWORD [rsi+rcx]  ; 16-byte piece of the tail
    129     vperm2i128  ymmA, ymmA, ymmA, 1      ; swap the 128-bit halves of ymmA
    130     vpor        ymmA, ymmA, ymmB
    131 .column_ld32:
    132     test        cl, SIZEOF_YMMWORD
    133     jz          short .column_ld64
    134     sub         rcx, byte SIZEOF_YMMWORD
    135     vmovdqa     ymmF, ymmA
    136     vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    137 .column_ld64:
    138     test        cl, 2*SIZEOF_YMMWORD
    139     mov         rcx, SIZEOF_YMMWORD     ; mov keeps the flags; the loop-end sub then leaves rcx = 0
    140     jz          short .rgb_ycc_cnv
    141     vmovdqa     ymmB, ymmA
    142     vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    143     vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    144     jmp         short .rgb_ycc_cnv
    145 
    146 .columnloop:
    147     vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    148     vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    149     vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    150 
    151 .rgb_ycc_cnv:
    152     ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    153     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    154     ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    155     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    156     ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    157     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    158 
    159     vmovdqa     ymmC, ymmA
    160     vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    161                                      ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    162     vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    163                                      ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    164     vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    165                                      ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    166     vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
    167                                      ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)
    168 
    169     vmovdqa     ymmG, ymmA
    170     vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
    171                                   ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    172     vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
    173                                   ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)
    174 
    175     vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
    176                                   ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    177     vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
    178                                   ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)
    179 
    180     vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
    181                                   ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    182     vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
    183                                   ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)
    184 
    185     vmovdqa     ymmD, ymmA
    186     vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
    187                                   ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    188     vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
    189                                   ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)
    190 
    191     vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
    192                                   ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    193     vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
    194                                   ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)
    195 
    196     vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
    197                                   ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    198     vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
    199                                   ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)
    200 
    201     vmovdqa     ymmE, ymmA
    202     vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
    203                                   ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    204     vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
    205                                   ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)
    206 
    207     vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
    208                                   ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    209     vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
    210                                   ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)
    211 
    212     vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
    213                                   ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    214     vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
    215                                   ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)
    216 
    217     vpxor       ymmH, ymmH, ymmH  ; ymmH = 0, used below to widen bytes to words
    218 
    219     vmovdqa     ymmC, ymmA
    220     vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    221     vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
    222 
    223     vmovdqa     ymmB, ymmE
    224     vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    225     vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    226 
    227     vmovdqa     ymmF, ymmD
    228     vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    229     vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    230 
    231 %else  ; RGB_PIXELSIZE == 4 ; -----------
    232 
    233 .column_ld1:
    234     test        cl, SIZEOF_XMMWORD/16   ; rcx counts pixels here; loads scale it by RGB_PIXELSIZE
    235     jz          short .column_ld2
    236     sub         rcx, byte SIZEOF_XMMWORD/16
    237     vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
    238 .column_ld2:
    239     test        cl, SIZEOF_XMMWORD/8
    240     jz          short .column_ld4
    241     sub         rcx, byte SIZEOF_XMMWORD/8
    242     vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    243     vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    244     vpor        xmmA, xmmA, xmmF
    245 .column_ld4:
    246     test        cl, SIZEOF_XMMWORD/4
    247     jz          short .column_ld8
    248     sub         rcx, byte SIZEOF_XMMWORD/4
    249     vmovdqa     xmmF, xmmA
    250     vperm2i128  ymmF, ymmF, ymmF, 1
    251     vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    252     vpor        ymmA, ymmA, ymmF
    253 .column_ld8:
    254     test        cl, SIZEOF_XMMWORD/2
    255     jz          short .column_ld16
    256     sub         rcx, byte SIZEOF_XMMWORD/2
    257     vmovdqa     ymmF, ymmA
    258     vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
    259 .column_ld16:
    260     test        cl, SIZEOF_XMMWORD
    261     mov         rcx, SIZEOF_YMMWORD     ; mov keeps the flags; the loop-end sub then leaves rcx = 0
    262     jz          short .rgb_ycc_cnv
    263     vmovdqa     ymmE, ymmA
    264     vmovdqa     ymmH, ymmF
    265     vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    266     vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    267     jmp         short .rgb_ycc_cnv
    268 
    269 .columnloop:
    270     vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    271     vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    272     vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    273     vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]
    274 
    275 .rgb_ycc_cnv:
    276     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    277     ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    278     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    279     ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    280     ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    281     ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    282     ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    283     ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    284 
    285     vmovdqa     ymmB, ymmA
    286     vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    287                                         ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    288     vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    289                                         ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    290 
    291     vmovdqa     ymmB, ymmF
    292     vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    293                                         ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    294     vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
    295                                         ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)
    296 
    297     vmovdqa     ymmD, ymmA
    298     vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
    299                                       ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    300     vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
    301                                       ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)
    302 
    303     vmovdqa     ymmC, ymmF
    304     vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
    305                                       ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    306     vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
    307                                       ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)
    308 
    309     vmovdqa     ymmB, ymmA
    310     vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
    311                                       ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    312     vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
    313                                       ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)
    314 
    315     vmovdqa     ymmG, ymmD
    316     vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
    317                                       ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    318     vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
    319                                       ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)
    320 
    321     vmovdqa     ymmE, ymmA
    322     vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
    323                                       ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    324     vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
    325                                       ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)
    326 
    327     vmovdqa     ymmH, ymmB
    328     vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
    329                                       ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    330     vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
    331                                       ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)
    332 
    333     vpxor       ymmF, ymmF, ymmF      ; ymmF = 0, used below to widen bytes to words
    334 
    335     vmovdqa     ymmC, ymmA
    336     vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    337     vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)
    338 
    339     vmovdqa     ymmD, ymmB
    340     vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    341     vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    342 
    343     vmovdqa     ymmG, ymmE
    344     vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    345     vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)
    346 
    347     vpunpcklbw  ymmF, ymmF, ymmH      ; bytes of ymmH land in the high byte of each word ...
    348     vpunpckhbw  ymmH, ymmH, ymmH
    349     vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    350     vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)
    351 
    352 %endif  ; RGB_PIXELSIZE ; ---------------
    353 
    354     ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    355     ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO
    356 
    357     ; (Original)
    358     ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    359     ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    360     ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    361     ;
    362     ; (This implementation)
    363     ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    364     ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    365     ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    366 
    367     vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
    368     vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
    369     vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
    370     vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO
    371 
    372     vmovdqa     ymm6, ymm1
    373     vpunpcklwd  ymm1, ymm1, ymm3
    374     vpunpckhwd  ymm6, ymm6, ymm3
    375     vmovdqa     ymm7, ymm1
    376     vmovdqa     ymm4, ymm6
    377     vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    378     vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    379     vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    380     vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
    381 
    382     vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    383     vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
    384 
    385     vpxor       ymm1, ymm1, ymm1
    386     vpxor       ymm6, ymm6, ymm6
    387     vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
    388     vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
    389     vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
    390     vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)
    391 
    392     vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]
    393 
    394     vpaddd      ymm7, ymm7, ymm1
    395     vpaddd      ymm4, ymm4, ymm6
    396     vpaddd      ymm7, ymm7, ymm5
    397     vpaddd      ymm4, ymm4, ymm5
    398     vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
    399     vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
    400     vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO
    401 
    402     vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE
    403 
    404     vmovdqa     ymm6, ymm0
    405     vpunpcklwd  ymm0, ymm0, ymm2
    406     vpunpckhwd  ymm6, ymm6, ymm2
    407     vmovdqa     ymm5, ymm0
    408     vmovdqa     ymm4, ymm6
    409     vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    410     vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    411     vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    412     vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
    413 
    414     vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    415     vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
    416 
    417     vpxor       ymm0, ymm0, ymm0
    418     vpxor       ymm6, ymm6, ymm6
    419     vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
    420     vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
    421     vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
    422     vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)
    423 
    424     vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
    425 
    426     vpaddd      ymm5, ymm5, ymm0
    427     vpaddd      ymm4, ymm4, ymm6
    428     vpaddd      ymm5, ymm5, ymm1
    429     vpaddd      ymm4, ymm4, ymm1
    430     vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
    431     vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
    432     vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE
    433 
    434     vpsllw      ymm7, ymm7, BYTE_BIT    ; odd-column Cb moves to the high byte of each word
    435     vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
    436     vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb
    437 
    438     vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
    439     vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
    440     vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO
    441 
    442     vmovdqa     ymm4, ymm0
    443     vpunpcklwd  ymm0, ymm0, ymm3
    444     vpunpckhwd  ymm4, ymm4, ymm3
    445     vmovdqa     ymm7, ymm0
    446     vmovdqa     ymm5, ymm4
    447     vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    448     vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    449     vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    450     vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
    451 
    452     vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]
    453 
    454     vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    455     vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    456     vpaddd      ymm0, ymm0, ymm3
    457     vpaddd      ymm4, ymm4, ymm3
    458     vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    459     vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    460     vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO
    461 
    462     vpxor       ymm3, ymm3, ymm3
    463     vpxor       ymm4, ymm4, ymm4
    464     vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
    465     vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
    466     vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
    467     vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)
    468 
    469     vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]
    470 
    471     vpaddd      ymm7, ymm7, ymm3
    472     vpaddd      ymm5, ymm5, ymm4
    473     vpaddd      ymm7, ymm7, ymm1
    474     vpaddd      ymm5, ymm5, ymm1
    475     vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
    476     vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
    477     vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO
    478 
    479     vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE
    480 
    481     vmovdqa     ymm4, ymm6
    482     vpunpcklwd  ymm6, ymm6, ymm2
    483     vpunpckhwd  ymm4, ymm4, ymm2
    484     vmovdqa     ymm1, ymm6
    485     vmovdqa     ymm5, ymm4
    486     vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    487     vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    488     vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    489     vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
    490 
    491     vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]
    492 
    493     vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    494     vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    495     vpaddd      ymm6, ymm6, ymm2
    496     vpaddd      ymm4, ymm4, ymm2
    497     vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    498     vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    499     vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE
    500 
    501     vpsllw      ymm0, ymm0, BYTE_BIT    ; odd-column Y moves to the high byte of each word
    502     vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    503     vmovdqu     YMMWORD [rdi], ymm6     ; Save Y
    504 
    505     vpxor       ymm2, ymm2, ymm2
    506     vpxor       ymm4, ymm4, ymm4
    507     vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
    508     vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
    509     vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
    510     vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)
    511 
    512     vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]
    513 
    514     vpaddd      ymm1, ymm1, ymm2
    515     vpaddd      ymm5, ymm5, ymm4
    516     vpaddd      ymm1, ymm1, ymm0
    517     vpaddd      ymm5, ymm5, ymm0
    518     vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
    519     vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
    520     vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE
    521 
    522     vpsllw      ymm7, ymm7, BYTE_BIT    ; odd-column Cr moves to the high byte of each word
    523     vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
    524     vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr
    525 
    526     sub         rcx, byte SIZEOF_YMMWORD           ; one group of pixels consumed
    527     add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    528     add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    529     add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    530     add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    531     cmp         rcx, byte SIZEOF_YMMWORD
    532     jae         near .columnloop                   ; another full group remains
    533     test        rcx, rcx
    534     jnz         near .column_ld1                   ; partial group remains: gather the tail
    535 
    536     pop         rcx                     ; col
    537     pop         rsi
    538     pop         rdi
    539     pop         rbx
    540     pop         rdx
    541 
    542     add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    543     add         rdi, byte SIZEOF_JSAMPROW
    544     add         rbx, byte SIZEOF_JSAMPROW
    545     add         rdx, byte SIZEOF_JSAMPROW
    546     dec         rax                        ; num_rows
    547     jg          near .rowloop
    548 
    549 .return:
    550     pop         rbx
    551     vzeroupper
    552     uncollect_args 5
    553     mov         rsp, rbp                ; rsp <- aligned rbp
    554     pop         rsp                     ; rsp <- pre-alignment rsp kept at [rbp] (address of saved rbp)
    555     pop         rbp
    556     ret
    557 
    558 ; Pad to a 32-byte boundary here: for some reason, the OS X linker does not
    559 ; honor the request to align the segment unless we do this.
    560     align       32
    561