;
; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; and can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;

; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows
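;
; These five arguments are loaded into r10-r14 by the collect_args macro
; used below, which hides the difference between the SysV and Win64
; calling conventions (see jsimdext.inc).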

%define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
%define WK_NUM		2
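; wk(i) addresses the i-th of WK_NUM 16-byte scratch slots placed just below
; the aligned frame pointer; the prologue reserves them with lea rsp,[wk(0)],
; so aligned (movdqa) loads and stores to wk() are safe.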

	align	16

	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE

EXTN(jsimd_rgb_gray_convert_sse2):
	push	rbp
	mov	rax,rsp				; rax = original rbp
	sub	rsp, byte 4
	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
	mov	[rsp],rax
	mov	rbp,rsp				; rbp = aligned rbp
	lea	rsp, [wk(0)]
	collect_args
	push	rbx
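
	; The prologue above keeps the caller's stack pointer in rax, rounds
	; rsp down to a 16-byte boundary, stores that saved pointer at the
	; base of the aligned frame ([rbp]), and then reserves the wk()
	; scratch slots.  The epilogue's mov rsp,rbp / pop rsp sequence
	; undoes this.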

	mov	rcx, r10
	test	rcx,rcx
	jz	near .return

	push	rcx

	mov rsi, r12
	mov rcx, r13
	mov	rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
	lea	rdi, [rdi+rcx*SIZEOF_JSAMPROW]

	pop	rcx

	mov rsi, r11
	mov	eax, r14d
	test	rax,rax
	jle	near .return
.rowloop:
	push	rdi
	push	rsi
	push	rcx			; col

	mov	rsi, JSAMPROW [rsi]	; inptr
	mov	rdi, JSAMPROW [rdi]	; outptr0

	cmp	rcx, byte SIZEOF_XMMWORD
	jae	near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

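	; Fewer than SIZEOF_XMMWORD pixels remain in this row (rcx < 16).
	; The loads below pick up the trailing 3*rcx bytes in power-of-two
	; pieces (1, 2, 4 and 8 bytes, then whole xmmwords), accumulating
	; them into xmmA/xmmF/xmmB so the same .rgb_gray_cnv shuffle code
	; handles full and partial blocks alike.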
.column_ld1:
	push	rax
	push	rdx
	lea	rcx,[rcx+rcx*2]		; imul ecx,RGB_PIXELSIZE
	test	cl, SIZEOF_BYTE
	jz	short .column_ld2
	sub	rcx, byte SIZEOF_BYTE
	movzx	rax, BYTE [rsi+rcx]
.column_ld2:
	test	cl, SIZEOF_WORD
	jz	short .column_ld4
	sub	rcx, byte SIZEOF_WORD
	movzx	rdx, WORD [rsi+rcx]
	shl	rax, WORD_BIT
	or	rax,rdx
.column_ld4:
	movd	xmmA,eax
	pop	rdx
	pop	rax
	test	cl, SIZEOF_DWORD
	jz	short .column_ld8
	sub	rcx, byte SIZEOF_DWORD
	movd	xmmF, XMM_DWORD [rsi+rcx]
	pslldq	xmmA, SIZEOF_DWORD
	por	xmmA,xmmF
.column_ld8:
	test	cl, SIZEOF_MMWORD
	jz	short .column_ld16
	sub	rcx, byte SIZEOF_MMWORD
	movq	xmmB, XMM_MMWORD [rsi+rcx]
	pslldq	xmmA, SIZEOF_MMWORD
	por	xmmA,xmmB
.column_ld16:
	test	cl, SIZEOF_XMMWORD
	jz	short .column_ld32
	movdqa	xmmF,xmmA
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	mov	rcx, SIZEOF_XMMWORD
	jmp	short .rgb_gray_cnv
.column_ld32:
	test	cl, 2*SIZEOF_XMMWORD
	mov	rcx, SIZEOF_XMMWORD
	jz	short .rgb_gray_cnv
	movdqa	xmmB,xmmA
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	jmp	short .rgb_gray_cnv

.columnloop:
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	movdqu	xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
	; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
	; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
	; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

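	; The shift/unpack sequence below is in effect a byte transpose:
	; repeated rounds of pslldq/psrldq plus punpck{l,h}bw regroup the
	; interleaved triplets so that, after zero-extension against xmmH,
	; six registers hold components 0/1/2 of the even- and odd-indexed
	; pixels as 16-bit words.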
	movdqa    xmmG,xmmA
	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
	psrldq    xmmG,8	; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

	punpckhbw xmmA,xmmF	; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
	pslldq    xmmF,8	; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

	punpcklbw xmmG,xmmB	; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
	punpckhbw xmmF,xmmB	; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

	movdqa    xmmD,xmmA
	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
	psrldq    xmmD,8	; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

	punpckhbw xmmA,xmmG	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
	pslldq    xmmG,8	; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

	punpcklbw xmmD,xmmF	; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
	punpckhbw xmmG,xmmF	; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

	movdqa    xmmE,xmmA
	pslldq    xmmA,8	; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
	psrldq    xmmE,8	; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

	punpckhbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
	pslldq    xmmD,8	; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

	punpcklbw xmmE,xmmG	; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
	punpckhbw xmmD,xmmG	; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

	pxor      xmmH,xmmH

	movdqa    xmmC,xmmA
	punpcklbw xmmA,xmmH	; xmmA=(00 02 04 06 08 0A 0C 0E)
	punpckhbw xmmC,xmmH	; xmmC=(10 12 14 16 18 1A 1C 1E)

	movdqa    xmmB,xmmE
	punpcklbw xmmE,xmmH	; xmmE=(20 22 24 26 28 2A 2C 2E)
	punpckhbw xmmB,xmmH	; xmmB=(01 03 05 07 09 0B 0D 0F)

	movdqa    xmmF,xmmD
	punpcklbw xmmD,xmmH	; xmmD=(11 13 15 17 19 1B 1D 1F)
	punpckhbw xmmF,xmmH	; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

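	; Tail handling for RGB_PIXELSIZE == 4: four bytes per pixel divide
	; the xmmword evenly, so the remaining rcx < 16 pixels can be
	; gathered with simple power-of-two loads (1, 2, 4, then 8 pixels)
	; into xmmA/xmmE, without the byte/word fix-ups needed in the
	; 3-byte case.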
.column_ld1:
	test	cl, SIZEOF_XMMWORD/16
	jz	short .column_ld2
	sub	rcx, byte SIZEOF_XMMWORD/16
	movd	xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
	test	cl, SIZEOF_XMMWORD/8
	jz	short .column_ld4
	sub	rcx, byte SIZEOF_XMMWORD/8
	movq	xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
	pslldq	xmmA, SIZEOF_MMWORD
	por	xmmA,xmmE
.column_ld4:
	test	cl, SIZEOF_XMMWORD/4
	jz	short .column_ld8
	sub	rcx, byte SIZEOF_XMMWORD/4
	movdqa	xmmE,xmmA
	movdqu	xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
	test	cl, SIZEOF_XMMWORD/2
	mov	rcx, SIZEOF_XMMWORD
	jz	short .rgb_gray_cnv
	movdqa	xmmF,xmmA
	movdqa	xmmH,xmmE
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	jmp	short .rgb_gray_cnv

.columnloop:
	movdqu	xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqu	xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
	movdqu	xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
	movdqu	xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
	; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
	; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
	; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
	; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

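	; With 4 bytes per pixel the deinterleave is a straight byte
	; transpose: successive rounds of punpck{l,h}bw/wd separate the four
	; components, and a final unpack against a zero register (with psrlw
	; for the last pair) widens each sample to a 16-bit word, split into
	; even- and odd-indexed pixels.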
	movdqa    xmmD,xmmA
	punpcklbw xmmA,xmmE	; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
	punpckhbw xmmD,xmmE	; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

	movdqa    xmmC,xmmF
	punpcklbw xmmF,xmmH	; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
	punpckhbw xmmC,xmmH	; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

	movdqa    xmmB,xmmA
	punpcklwd xmmA,xmmF	; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
	punpckhwd xmmB,xmmF	; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

	movdqa    xmmG,xmmD
	punpcklwd xmmD,xmmC	; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
	punpckhwd xmmG,xmmC	; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

	movdqa    xmmE,xmmA
	punpcklbw xmmA,xmmD	; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
	punpckhbw xmmE,xmmD	; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

	movdqa    xmmH,xmmB
	punpcklbw xmmB,xmmG	; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
	punpckhbw xmmH,xmmG	; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

	pxor      xmmF,xmmF

	movdqa    xmmC,xmmA
	punpcklbw xmmA,xmmF	; xmmA=(00 02 04 06 08 0A 0C 0E)
	punpckhbw xmmC,xmmF	; xmmC=(10 12 14 16 18 1A 1C 1E)

	movdqa    xmmD,xmmB
	punpcklbw xmmB,xmmF	; xmmB=(01 03 05 07 09 0B 0D 0F)
	punpckhbw xmmD,xmmF	; xmmD=(11 13 15 17 19 1B 1D 1F)

	movdqa    xmmG,xmmE
	punpcklbw xmmE,xmmF	; xmmE=(20 22 24 26 28 2A 2C 2E)
	punpckhbw xmmG,xmmF	; xmmG=(30 32 34 36 38 3A 3C 3E)

	punpcklbw xmmF,xmmH
	punpckhbw xmmH,xmmH
	psrlw     xmmF,BYTE_BIT	; xmmF=(21 23 25 27 29 2B 2D 2F)
	psrlw     xmmH,BYTE_BIT	; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

	; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
	; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

	; (Original)
	; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
	;
	; (This implementation)
	; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
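	;
	; Splitting 0.587*G into 0.337*G + 0.250*G lets each pmaddwd below
	; combine two products at once -- (R,G) against PW_F0299_F0337 and
	; (B,G) against PW_F0114_F0250 -- and keeps every coefficient within
	; the signed 16-bit range pmaddwd operates on.  Assuming the
	; SCALEBITS = 16 fixed point used by jcolsamp.inc, the constants work
	; out to roughly FIX(0.299)=19595, FIX(0.337)=22086, FIX(0.114)=7471
	; and FIX(0.250)=16384; note 22086 + 16384 = 38470 = FIX(0.587),
	; which would not fit in a signed word on its own.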

	movdqa    xmm6,xmm1
	punpcklwd xmm1,xmm3
	punpckhwd xmm6,xmm3
	pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

	movdqa    xmm7, xmm6	; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

	movdqa    xmm6,xmm0
	punpcklwd xmm0,xmm2
	punpckhwd xmm6,xmm2
	pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
	pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

	movdqa    XMMWORD [wk(0)], xmm0	; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
	movdqa    XMMWORD [wk(1)], xmm6	; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

	movdqa    xmm0, xmm5	; xmm0=BO
	movdqa    xmm6, xmm4	; xmm6=BE

	movdqa    xmm4,xmm0
	punpcklwd xmm0,xmm3
	punpckhwd xmm4,xmm3
	pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

	movdqa    xmm3,[rel PD_ONEHALF]	; xmm3=[PD_ONEHALF]

	paddd     xmm0, xmm1
	paddd     xmm4, xmm7
	paddd     xmm0,xmm3
	paddd     xmm4,xmm3
	psrld     xmm0,SCALEBITS	; xmm0=YOL
	psrld     xmm4,SCALEBITS	; xmm4=YOH
	packssdw  xmm0,xmm4		; xmm0=YO
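
	; xmm0 now holds the eight odd-indexed luma values.  PD_ONEHALF
	; (presumably 1 << (SCALEBITS-1), defined in jcolsamp.inc) is added
	; before the SCALEBITS right shift so the fixed-point sum is rounded
	; to nearest instead of truncated; the same applies to YE below.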

	movdqa    xmm4,xmm6
	punpcklwd xmm6,xmm2
	punpckhwd xmm4,xmm2
	pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
	pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

	movdqa    xmm2,[rel PD_ONEHALF]	; xmm2=[PD_ONEHALF]

	paddd     xmm6, XMMWORD [wk(0)]
	paddd     xmm4, XMMWORD [wk(1)]
	paddd     xmm6,xmm2
	paddd     xmm4,xmm2
	psrld     xmm6,SCALEBITS	; xmm6=YEL
	psrld     xmm4,SCALEBITS	; xmm4=YEH
	packssdw  xmm6,xmm4		; xmm6=YE

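	; YE holds the even-indexed and YO the odd-indexed luma samples as
	; words.  Shifting YO left by one byte and OR-ing it into YE
	; re-interleaves them, yielding 16 consecutive Y bytes in pixel order.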
	psllw     xmm0,BYTE_BIT
	por       xmm6,xmm0		; xmm6=Y
	movdqa    XMMWORD [rdi], xmm6	; Save Y

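	; Each pass through .columnloop consumes SIZEOF_XMMWORD (16) pixels;
	; a nonzero remainder of fewer than 16 pixels is sent back to the
	; .column_ld1 tail loader above before the row is finished.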
	sub	rcx, byte SIZEOF_XMMWORD
	add	rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD	; inptr
	add	rdi, byte SIZEOF_XMMWORD		; outptr0
	cmp	rcx, byte SIZEOF_XMMWORD
	jae	near .columnloop
	test	rcx,rcx
	jnz	near .column_ld1

	pop	rcx			; col
	pop	rsi
	pop	rdi

	add	rsi, byte SIZEOF_JSAMPROW	; input_buf
	add	rdi, byte SIZEOF_JSAMPROW
	dec	rax				; num_rows
	jg	near .rowloop

.return:
	pop	rbx
	uncollect_args
	mov	rsp,rbp		; rsp <- aligned rbp
	pop	rsp		; rsp <- original rbp
	pop	rbp
	ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16
    365