      1 /*
      2  * (C) Copyright IBM Corporation 2004
      3  * All Rights Reserved.
      4  *
      5  * Permission is hereby granted, free of charge, to any person obtaining a
      6  * copy of this software and associated documentation files (the "Software"),
      7  * to deal in the Software without restriction, including without limitation
      8  * on the rights to use, copy, modify, merge, publish, distribute, sub
      9  * license, and/or sell copies of the Software, and to permit persons to whom
     10  * the Software is furnished to do so, subject to the following conditions:
     11  *
     12  * The above copyright notice and this permission notice (including the next
     13  * paragraph) shall be included in all copies or substantial portions of the
     14  * Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
     19  * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
     20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
     21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
     22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
     23  */
     24 
     25 /**
     26  * \file read_rgba_span_x86.S
     27  * Optimized routines to transfer pixel data from the framebuffer to a
     28  * buffer in main memory.
     29  *
      30  * \author Ian Romanick <idr@us.ibm.com>
     31  */
     32 
     33 	.file	"read_rgba_span_x86.S"
     34 #if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
     35 /* Kevin F. Quinn 2nd July 2006
     36  * Replaced data segment constants with text-segment instructions.
     37  */
     38 #define	LOAD_MASK(mvins,m1,m2) \
     39    	pushl	$0xff00ff00 ;\
     40    	pushl	$0xff00ff00 ;\
     41    	pushl	$0xff00ff00 ;\
     42    	pushl	$0xff00ff00 ;\
     43 	mvins	(%esp), m1	;\
     44    	pushl	$0x00ff0000 ;\
     45    	pushl	$0x00ff0000 ;\
     46    	pushl	$0x00ff0000 ;\
     47    	pushl	$0x00ff0000 ;\
     48 	mvins	(%esp), m2	;\
     49 	addl	$32, %esp
     50 
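/* After LOAD_MASK, m1 holds 0xff00ff00 replicated across the register (it
 * selects the alpha and green bytes of each BGRA dword, which stay in place)
 * and m2 holds 0x00ff0000 replicated (it isolates the byte that has to move
 * when red and blue are swapped).
 */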
     51 /* I implemented these as macros because they appear in several places,
     52  * and I've tweaked them a number of times.  I got tired of changing every
     53  * place they appear. :)
     54  */
     55 
     56 #define DO_ONE_PIXEL() \
     57 	movl	(%ebx), %eax ; \
     58 	addl	$4, %ebx ; \
     59 	bswap	%eax          /* ARGB -> BGRA */ ; \
     60 	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
     61 	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
     62 	addl	$4, %ecx
     63 
     64 #define DO_ONE_LAST_PIXEL() \
     65 	movl	(%ebx), %eax ; \
     66 	bswap	%eax          /* ARGB -> BGRA */ ; \
     67 	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
      68 	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */
     69 
     70 
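/* In rough C terms, each of the macros above converts one pixel like this
 * (assuming a little-endian CPU and BGRA byte order in the framebuffer):
 *
 *     uint32_t p = *src++;        // register value 0xAARRGGBB
 *     p = __builtin_bswap32(p);   // 0xBBGGRRAA
 *     p = (p >> 8) | (p << 24);   // rotate right 8 -> 0xAABBGGRR
 *     *dst++ = p;                 // stored bytes: R, G, B, A
 */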
     71 /**
     72  * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
     73  *
     74  * \warning
     75  * This function assumes that the caller will issue the EMMS instruction
     76  * at the correct places.
     77  */
     78 
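/* Judging from the stack offsets below, the three BGRA8888_REV entry points
 * in this file take their arguments in the same order; in rough, hypothetical
 * C terms:
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX(const void *src,
 *                                                    void *dst,
 *                                                    unsigned n_pixels);
 *
 * (The RGB565 routine at the end of the file uses the same argument order;
 * the real declarations live in the C code that calls these routines.)
 */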
     79 .globl _generic_read_RGBA_span_BGRA8888_REV_MMX
     80 #ifndef USE_DRICORE
     81 .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
     82 #endif
     83 	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
     84 _generic_read_RGBA_span_BGRA8888_REV_MMX:
     85 	pushl	%ebx
     86 
     87 #ifdef USE_INNER_EMMS
     88 	emms
     89 #endif
     90 	LOAD_MASK(movq,%mm1,%mm2)
     91 
     92 	movl	8(%esp), %ebx	/* source pointer */
     93 	movl	16(%esp), %edx	/* number of pixels to copy */
     94 	movl	12(%esp), %ecx	/* destination pointer */
     95 
     96 	testl	%edx, %edx
     97 	jle	.L20		/* Bail if there's nothing to do. */
     98 
     99 	movl	%ebx, %eax
    100 
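	/* eax = ((-src) >> 2) & 1: with a 4-byte aligned source this is 1
	 * when one leading pixel must be copied to bring the pointer to
	 * 8-byte alignment for the movq loads below, and 0 otherwise.
	 */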
    101 	negl	%eax
    102 	sarl	$2, %eax
    103 	andl	$1, %eax
    104 	je	.L17
    105 
    106 	subl	%eax, %edx
    107 	DO_ONE_PIXEL()
    108 .L17:
    109 
    110 	/* Would it be faster to unroll this loop once and process 4 pixels
    111 	 * per pass, instead of just two?
    112 	 */
    113 
    114 	movl	%edx, %eax
    115 	shrl	%eax
    116 	jmp	.L18
    117 .L19:
    118 	movq	(%ebx), %mm0
    119 	addl	$8, %ebx
    120 
     121 	/* These 9 instructions do what a single PSHUFB could do.  That
     122 	 * instruction didn't arrive until SSSE3, so it isn't used here. :(
     123 	 */
    124 
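	/* Per 32-bit pixel (register value 0xAARRGGBB) this computes roughly:
	 *
	 *     out = (p & 0xff00ff00)          // A and G stay put
	 *         | ((p & 0x00ff0000) >> 16)  // R moves down to byte 0
	 *         | ((p << 16) & 0x00ff0000)  // B moves up to byte 2
	 *
	 * giving 0xAABBGGRR, i.e. R, G, B, A in memory.  The 64-bit shifts
	 * spill bytes across the pixel boundary, but the masks discard them.
	 */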
    125 	movq	%mm0, %mm3
    126 	movq	%mm0, %mm4
    127 
    128 	pand	%mm2, %mm3
    129 	psllq	$16, %mm4
    130 	psrlq	$16, %mm3
    131 	pand	%mm2, %mm4
    132 
    133 	pand	%mm1, %mm0
    134 	por	%mm4, %mm3
    135 	por	%mm3, %mm0
    136 
    137 	movq	%mm0, (%ecx)
    138 	addl	$8, %ecx
    139 	subl	$1, %eax
    140 .L18:
    141 	jne	.L19
    142 
    143 #ifdef USE_INNER_EMMS
    144 	emms
    145 #endif
    146 
    147 	/* At this point there are either 1 or 0 pixels remaining to be
    148 	 * converted.  Convert the last pixel, if needed.
    149 	 */
    150 
    151 	testl	$1, %edx
    152 	je	.L20
    153 
    154 	DO_ONE_LAST_PIXEL()
    155 
    156 .L20:
    157 	popl	%ebx
    158 	ret
    159 	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
    160 
    161 
    162 /**
    163  * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
    164  * instructions are only actually used to read data from the framebuffer.
    165  * In practice, the speed-up is pretty small.
    166  *
    167  * \todo
    168  * Do some more testing and determine if there's any reason to have this
    169  * function in addition to the MMX version.
    170  *
    171  * \warning
    172  * This function assumes that the caller will issue the EMMS instruction
    173  * at the correct places.
    174  */
    175 
    176 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE
    177 #ifndef USE_DRICORE
    178 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
    179 #endif
    180 	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
    181 _generic_read_RGBA_span_BGRA8888_REV_SSE:
    182 	pushl	%esi
    183 	pushl	%ebx
    184 	pushl	%ebp
    185 
    186 #ifdef USE_INNER_EMMS
    187 	emms
    188 #endif
    189 
    190 	LOAD_MASK(movq,%mm1,%mm2)
    191 
    192 	movl	16(%esp), %ebx	/* source pointer */
    193 	movl	24(%esp), %edx	/* number of pixels to copy */
    194 	movl	20(%esp), %ecx	/* destination pointer */
    195 
    196 	testl	%edx, %edx
    197 	jle	.L35		/* Bail if there's nothing to do. */
    198 
    199 	movl	%esp, %ebp
    200 	subl	$16, %esp
    201 	andl	$0xfffffff0, %esp
    202 
    203 	movl	%ebx, %eax
    204 	movl	%edx, %esi
    205 
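	/* esi = min(((-src) & 15) >> 2, count): the number of leading pixels
	 * (0 to 3) that are converted one at a time so that the movaps loads
	 * in the main loop hit a 16-byte aligned address.
	 */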
    206 	negl	%eax
    207 	andl	$15, %eax
    208 	sarl	$2, %eax
    209 	cmpl	%edx, %eax
    210 	cmovle	%eax, %esi
    211 
    212 	subl	%esi, %edx
    213 
    214 	testl	$1, %esi
    215 	je	.L32
    216 
    217 	DO_ONE_PIXEL()
    218 .L32:
    219 
    220 	testl	$2, %esi
    221 	je	.L31
    222 
    223 	movq	(%ebx), %mm0
    224 	addl	$8, %ebx
    225 
    226 	movq	%mm0, %mm3
    227 	movq	%mm0, %mm4
    228 
    229 	pand	%mm2, %mm3
    230 	psllq	$16, %mm4
    231 	psrlq	$16, %mm3
    232 	pand	%mm2, %mm4
    233 
    234 	pand	%mm1, %mm0
    235 	por	%mm4, %mm3
    236 	por	%mm3, %mm0
    237 
    238 	movq	%mm0, (%ecx)
    239 	addl	$8, %ecx
    240 .L31:
    241 
    242 	movl	%edx, %eax
    243 	shrl	$2, %eax
    244 	jmp	.L33
    245 .L34:
    246 	movaps	(%ebx), %xmm0
    247 	addl	$16, %ebx
    248 
    249 	/* This would be so much better if we could just move directly from
    250 	 * an SSE register to an MMX register.  Unfortunately, that
    251 	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
    252 	 * instruction.
    253 	 */
    254 
    255 	movaps	%xmm0, (%esp)
    256 	movq	(%esp), %mm0
    257 	movq	8(%esp), %mm5
    258 
    259 	movq	%mm0, %mm3
    260 	movq	%mm0, %mm4
    261 	movq	%mm5, %mm6
    262 	movq	%mm5, %mm7
    263 
    264 	pand	%mm2, %mm3
    265 	pand	%mm2, %mm6
    266 
    267 	psllq	$16, %mm4
    268 	psllq	$16, %mm7
    269 
    270 	psrlq	$16, %mm3
    271 	psrlq	$16, %mm6
    272 
    273 	pand	%mm2, %mm4
    274 	pand	%mm2, %mm7
    275 
    276 	pand	%mm1, %mm0
    277 	pand	%mm1, %mm5
    278 
    279 	por	%mm4, %mm3
    280 	por	%mm7, %mm6
    281 
    282 	por	%mm3, %mm0
    283 	por	%mm6, %mm5
    284 
    285 	movq	%mm0, (%ecx)
    286 	movq	%mm5, 8(%ecx)
    287 	addl	$16, %ecx
    288 
    289 	subl	$1, %eax
    290 .L33:
    291 	jne	.L34
    292 
    293 #ifdef USE_INNER_EMMS
    294 	emms
    295 #endif
    296 	movl	%ebp, %esp
    297 
    298 	/* At this point there are either [0, 3] pixels remaining to be
    299 	 * converted.
    300 	 */
    301 
    302 	testl	$2, %edx
    303 	je	.L36
    304 
    305 	movq	(%ebx), %mm0
    306 	addl	$8, %ebx
    307 
    308 	movq	%mm0, %mm3
    309 	movq	%mm0, %mm4
    310 
    311 	pand	%mm2, %mm3
    312 	psllq	$16, %mm4
    313 	psrlq	$16, %mm3
    314 	pand	%mm2, %mm4
    315 
    316 	pand	%mm1, %mm0
    317 	por	%mm4, %mm3
    318 	por	%mm3, %mm0
    319 
    320 	movq	%mm0, (%ecx)
    321 	addl	$8, %ecx
    322 .L36:
    323 
    324 	testl	$1, %edx
    325 	je	.L35
    326 
    327 	DO_ONE_LAST_PIXEL()
    328 .L35:
    329 	popl	%ebp
    330 	popl	%ebx
    331 	popl	%esi
    332 	ret
    333 	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
    334 
    335 
    336 /**
    337  * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
    338  */
    339 
    340 	.text
    341 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
    342 #ifndef USE_DRICORE
    343 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
    344 #endif
    345 	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
    346 _generic_read_RGBA_span_BGRA8888_REV_SSE2:
    347 	pushl	%esi
    348 	pushl	%ebx
    349 
    350 	LOAD_MASK(movdqu,%xmm1,%xmm2)
    351 
    352 	movl	12(%esp), %ebx	/* source pointer */
    353 	movl	20(%esp), %edx	/* number of pixels to copy */
    354 	movl	16(%esp), %ecx	/* destination pointer */
    355 
    356 	movl	%ebx, %eax
    357 	movl	%edx, %esi
    358 
    359 	testl	%edx, %edx
    360 	jle	.L46		/* Bail if there's nothing to do. */
    361 
    362 	/* If the source pointer isn't a multiple of 16 we have to process
    363 	 * a few pixels the "slow" way to get the address aligned for
     364 	 * the SSE fetch instructions.
    365 	 */
    366 
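	/* As in the SSE version: esi = min(((-src) & 15) >> 2, count) leading
	 * pixels are converted one at a time below.
	 */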
    367 	negl	%eax
    368 	andl	$15, %eax
    369 	sarl	$2, %eax
    370 
    371 	cmpl	%edx, %eax
    372 	cmovbe	%eax, %esi
    373 	subl	%esi, %edx
    374 
    375 	testl	$1, %esi
    376 	je	.L41
    377 
    378 	DO_ONE_PIXEL()
    379 .L41:
    380 	testl	$2, %esi
    381 	je	.L40
    382 
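	/* Same mask-and-shift swizzle as the MMX loops above; note that
	 * pslldq/psrldq count bytes, so the shift amount here is 2 (bytes)
	 * rather than 16 (bits).
	 */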
    383 	movq	(%ebx), %xmm0
    384 	addl	$8, %ebx
    385 
    386 	movdqa	%xmm0, %xmm3
    387 	movdqa	%xmm0, %xmm4
    388 	andps	%xmm1, %xmm0
    389 
    390 	andps	%xmm2, %xmm3
    391 	pslldq	$2, %xmm4
    392 	psrldq	$2, %xmm3
    393 	andps	%xmm2, %xmm4
    394 
    395 	orps	%xmm4, %xmm3
    396 	orps	%xmm3, %xmm0
    397 
    398 	movq	%xmm0, (%ecx)
    399 	addl	$8, %ecx
    400 .L40:
    401 
    402 	/* Would it be worth having a specialized version of this loop for
    403 	 * the case where the destination is 16-byte aligned?  That version
     404 	 * would be identical except that it could use movdqa instead of
    405 	 * movdqu.
    406 	 */
    407 
    408 	movl	%edx, %eax
    409 	shrl	$2, %eax
    410 	jmp	.L42
    411 .L43:
    412 	movdqa	(%ebx), %xmm0
    413 	addl	$16, %ebx
    414 
    415 	movdqa	%xmm0, %xmm3
    416 	movdqa	%xmm0, %xmm4
    417 	andps	%xmm1, %xmm0
    418 
    419 	andps	%xmm2, %xmm3
    420 	pslldq	$2, %xmm4
    421 	psrldq	$2, %xmm3
    422 	andps	%xmm2, %xmm4
    423 
    424 	orps	%xmm4, %xmm3
    425 	orps	%xmm3, %xmm0
    426 
    427 	movdqu	%xmm0, (%ecx)
    428 	addl	$16, %ecx
    429 	subl	$1, %eax
    430 .L42:
    431 	jne	.L43
    432 
    433 
     434 	/* There may be up to 3 pixels remaining to be copied.  Take care
    435 	 * of them now.  We do the 2 pixel case first because the data
    436 	 * will be aligned.
    437 	 */
    438 
    439 	testl	$2, %edx
    440 	je	.L47
    441 
    442 	movq	(%ebx), %xmm0
    443 	addl	$8, %ebx
    444 
    445 	movdqa	%xmm0, %xmm3
    446 	movdqa	%xmm0, %xmm4
    447 	andps	%xmm1, %xmm0
    448 
    449 	andps	%xmm2, %xmm3
    450 	pslldq	$2, %xmm4
    451 	psrldq	$2, %xmm3
    452 	andps	%xmm2, %xmm4
    453 
    454 	orps	%xmm4, %xmm3
    455 	orps	%xmm3, %xmm0
    456 
    457 	movq	%xmm0, (%ecx)
    458 	addl	$8, %ecx
    459 .L47:
    460 
    461 	testl	$1, %edx
    462 	je	.L46
    463 
    464 	DO_ONE_LAST_PIXEL()
    465 .L46:
    466 
    467 	popl	%ebx
    468 	popl	%esi
    469 	ret
    470 	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
    471 
    472 
    473 
    474 #define MASK_565_L	0x07e0f800
    475 #define MASK_565_H	0x0000001f
    476 /* Setting SCALE_ADJUST to 5 gives a perfect match with the
    477  * classic C implementation in Mesa.  Setting SCALE_ADJUST
    478  * to 0 is slightly faster but at a small cost to accuracy.
    479  */
    480 #define SCALE_ADJUST	5
    481 #if SCALE_ADJUST == 5
    482 #define PRESCALE_L 0x00100001
    483 #define PRESCALE_H 0x00000200
    484 #define SCALE_L 0x40C620E8
    485 #define SCALE_H 0x0000839d
    486 #elif SCALE_ADJUST == 0
    487 #define PRESCALE_L 0x00200001
    488 #define PRESCALE_H 0x00000800
    489 #define SCALE_L 0x01040108
    490 #define SCALE_H 0x00000108
    491 #else
    492 #error SCALE_ADJUST must either be 5 or 0.
    493 #endif
    494 #define ALPHA_L 0x00000000
    495 #define ALPHA_H 0x00ff0000
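/* A rough sketch of the math with SCALE_ADJUST == 5: each RGB565 pixel is
 * broadcast to four 16-bit words and masked so that word 0 holds r << 11,
 * word 1 holds g << 5 and word 2 holds b (r, b in [0, 31], g in [0, 63]).
 * PMULLW by PRESCALE followed by PSRLW leaves r << 6, g << 4 and b << 4,
 * and PMULHUW by SCALE then yields approximately r * 255 / 31, g * 255 / 63
 * and b * 255 / 31, so a maximal component maps to exactly 255.
 */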
    496 
    497 /**
    498  * MMX optimized version of the RGB565 to RGBA copy routine.
    499  */
    500 
    501 	.text
    502 	.globl	_generic_read_RGBA_span_RGB565_MMX
    503 #ifndef USE_DRICORE
    504         .hidden _generic_read_RGBA_span_RGB565_MMX
    505 #endif
    506 	.type	_generic_read_RGBA_span_RGB565_MMX, @function
    507 
    508 _generic_read_RGBA_span_RGB565_MMX:
    509 
    510 #ifdef USE_INNER_EMMS
    511 	emms
    512 #endif
    513 
    514 	movl	4(%esp), %eax	/* source pointer */
    515 	movl	8(%esp), %edx	/* destination pointer */
    516 	movl	12(%esp), %ecx	/* number of pixels to copy */
    517 
    518 	pushl	$MASK_565_H
    519 	pushl	$MASK_565_L
    520 	movq	(%esp), %mm5
    521 	pushl	$PRESCALE_H
    522 	pushl	$PRESCALE_L
    523 	movq	(%esp), %mm6
    524 	pushl	$SCALE_H
    525 	pushl	$SCALE_L
    526 	movq	(%esp), %mm7
    527 	pushl	$ALPHA_H
    528 	pushl	$ALPHA_L
    529 	movq	(%esp), %mm3
    530 	addl	$32,%esp
    531 
    532 	sarl	$2, %ecx
    533 	jl	.L01		/* Bail early if the count is negative. */
    534 	jmp	.L02
    535 
    536 .L03:
    537 	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
    538 	 * second pixels into the four words of %mm0 and %mm2.
     539 	 */
    540 
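	/* pshufw's immediate picks source words: 0x00 broadcasts word 0
	 * (pixel 0) and 0x55 broadcasts word 1 (pixel 1); 0xaa and 0xff
	 * further down select pixels 2 and 3.
	 */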
    541 	movq	(%eax), %mm4
    542 	addl	$8, %eax
    543 
    544 	pshufw	$0x00, %mm4, %mm0
    545 	pshufw	$0x55, %mm4, %mm2
    546 
    547 
    548 	/* Mask the pixels so that each word of each register contains only
    549 	 * one color component.
    550 	 */
    551 
    552 	pand	%mm5, %mm0
    553 	pand	%mm5, %mm2
    554 
    555 
    556 	/* Adjust the component values so that they are as small as possible,
    557 	 * but large enough so that we can multiply them by an unsigned 16-bit
    558 	 * number and get a value as large as 0x00ff0000.
    559  	 */
    560 
    561 	pmullw	%mm6, %mm0
    562 	pmullw	%mm6, %mm2
    563 #if SCALE_ADJUST > 0
    564 	psrlw	$SCALE_ADJUST, %mm0
    565 	psrlw	$SCALE_ADJUST, %mm2
    566 #endif
    567 
     568 	/* Scale the input component values to be in the range
     569 	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
    570 	 */
    571 
    572 	pmulhuw	%mm7, %mm0
    573 	pmulhuw	%mm7, %mm2
    574 
    575 
    576 	/* Always set the alpha value to 0xff.
    577 	 */
    578 
    579  	por %mm3, %mm0
    580  	por %mm3, %mm2
    581 
    582 
    583 	/* Pack the 16-bit values to 8-bit values and store the converted
    584 	 * pixel data.
    585 	 */
    586 
    587 	packuswb	%mm2, %mm0
    588 	movq	%mm0, (%edx)
    589 	addl	$8, %edx
    590 
    591 	pshufw	$0xaa, %mm4, %mm0
    592 	pshufw	$0xff, %mm4, %mm2
    593 
    594 	pand	%mm5, %mm0
    595 	pand	%mm5, %mm2
    596 	pmullw	%mm6, %mm0
    597 	pmullw	%mm6, %mm2
    598 #if SCALE_ADJUST > 0
    599 	psrlw	$SCALE_ADJUST, %mm0
    600 	psrlw	$SCALE_ADJUST, %mm2
    601 #endif
    602 	pmulhuw	%mm7, %mm0
    603 	pmulhuw	%mm7, %mm2
    604 
    605  	por %mm3, %mm0
    606  	por %mm3, %mm2
    607 
    608 	packuswb	%mm2, %mm0
    609 
    610 	movq	%mm0, (%edx)
    611 	addl	$8, %edx
    612 
    613 	subl	$1, %ecx
    614 .L02:
    615 	jne	.L03
    616 
    617 
    618 	/* At this point there can be at most 3 pixels left to process.  If
     619 	 * there are either 2 or 3 left, process 2.
    620          */
    621 
    622 	movl	12(%esp), %ecx
    623 	testl	$0x02, %ecx
    624 	je	.L04
    625 
    626 	movd	(%eax), %mm4
    627 	addl	$4, %eax
    628 
    629 	pshufw	$0x00, %mm4, %mm0
    630 	pshufw	$0x55, %mm4, %mm2
    631 
    632 	pand	%mm5, %mm0
    633 	pand	%mm5, %mm2
    634 	pmullw	%mm6, %mm0
    635 	pmullw	%mm6, %mm2
    636 #if SCALE_ADJUST > 0
    637 	psrlw	$SCALE_ADJUST, %mm0
    638 	psrlw	$SCALE_ADJUST, %mm2
    639 #endif
    640 	pmulhuw	%mm7, %mm0
    641 	pmulhuw	%mm7, %mm2
    642 
    643  	por %mm3, %mm0
    644  	por %mm3, %mm2
    645 
    646 	packuswb	%mm2, %mm0
    647 
    648 	movq	%mm0, (%edx)
    649 	addl	$8, %edx
    650 
    651 .L04:
    652 	/* At this point there can be at most 1 pixel left to process.
    653 	 * Process it if needed.
    654          */
    655 
    656 	testl	$0x01, %ecx
    657 	je	.L01
    658 
    659 	movzwl	(%eax), %ecx
    660 	movd	%ecx, %mm4
    661 
    662 	pshufw	$0x00, %mm4, %mm0
    663 
    664 	pand	%mm5, %mm0
    665 	pmullw	%mm6, %mm0
    666 #if SCALE_ADJUST > 0
    667 	psrlw	$SCALE_ADJUST, %mm0
    668 #endif
    669 	pmulhuw	%mm7, %mm0
    670 
    671  	por %mm3, %mm0
    672 
    673 	packuswb	%mm0, %mm0
    674 
    675 	movd	%mm0, (%edx)
    676 
    677 .L01:
    678 #ifdef USE_INNER_EMMS
    679 	emms
    680 #endif
    681 	ret
    682 #endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
    683 
    684 #if defined (__ELF__) && defined (__linux__)
    685 	.section .note.GNU-stack,"",%progbits
    686 #endif
    687