/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
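
/* LOAD_MASK leaves two byte-lane masks in the supplied registers:
 *
 *   m1 = 0xff00ff00 repeated -- keeps the green and alpha bytes of each pixel
 *   m2 = 0x00ff0000 repeated -- selects the byte lane through which the red
 *                               and blue bytes are swapped
 *
 * Sixteen bytes are pushed so that the same macro works for both movq
 * (8-byte MMX loads) and movdqu (16-byte SSE2 loads).
 */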

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \


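/* Worked example of the conversion done by the two macros above: a
 * BGRA8888_REV pixel is read as the dword 0xAARRGGBB.  bswap turns it into
 * 0xBBGGRRAA, rorl $8 into 0xAABBGGRR, and the little-endian store writes
 * that out as the bytes R, G, B, A -- the RGBA layout the caller expects.
 */
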
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

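	/* If the source is only 4-byte aligned, convert a single pixel the
	 * scalar way so that the main loop below reads 8-byte aligned
	 * quadwords.
	 */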
	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

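	/* Per 32-bit pixel (the dword 0xAARRGGBB), the sequence below computes
	 *   mm3 = (pix & 0x00ff0000) >> 16   = 0x000000RR   (red to byte 0)
	 *   mm4 = (pix << 16) & 0x00ff0000   = 0x00BB0000   (blue to byte 2)
	 *   mm0 =  pix & 0xff00ff00          = 0xAA00GG00   (green/alpha kept)
	 * on both pixels of the quadword at once, so that mm0 | mm3 | mm4 is
	 * 0xAABBGGRR, i.e. the bytes R, G, B, A in memory.
	 */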
	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp
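
	/* Carve out a 16-byte aligned scratch area on the stack.  The main
	 * loop spills each movaps load there so the data can be reloaded
	 * into MMX registers (see the MOVDQ2Q note below).
	 */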

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx
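
	/* %esi now holds the number of leading pixels (0..3, clamped to the
	 * total count) that must be converted before %ebx is 16-byte aligned;
	 * %edx holds the pixels left for the aligned loop.
	 */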

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx
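
	/* As in the SSE version: %esi = leading unaligned pixels (0..3,
	 * clamped to the total count), %edx = pixels left for the 16-byte
	 * aligned loop.
	 */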

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
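
/* How the constants above work (shown for SCALE_ADJUST == 5): each _L/_H
 * pair is pushed as one 64-bit MMX operand whose four 16-bit lanes hold,
 * from low to high, the red, green, blue and alpha factors.  For a 5-bit
 * red value R the conversion loop computes
 *
 *   ((R << 11) * 0x0001) >> 5   = R << 6
 *   ((R << 6)  * 0x20e8) >> 16  ~= R * 255 / 31
 *
 * so R = 31 maps to 255.  Green (6 bits) and blue use the analogous
 * 0x40c6 and 0x839d multipliers, and ALPHA_H puts 0x00ff in the alpha lane.
 */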

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.hidden	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp
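
	/* %mm5 = per-channel RGB565 masks, %mm6 = pre-scale factors,
	 * %mm7 = scale factors, %mm3 = constant alpha (0x00ff in the top lane).
	 */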

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

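	/* packuswb packs the four 16-bit lanes of %mm0 into the bytes
	 * R, G, B, A of the first pixel and the lanes of %mm2 into the same
	 * layout for the second pixel, saturating each value to 255.
	 */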
	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif