Home | History | Annotate | Download | only in x86-64
      1 /*
      2  * Mesa 3-D graphics library
      3  *
      4  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a
      7  * copy of this software and associated documentation files (the "Software"),
      8  * to deal in the Software without restriction, including without limitation
      9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     10  * and/or sell copies of the Software, and to permit persons to whom the
     11  * Software is furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included
     14  * in all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     22  * OTHER DEALINGS IN THE SOFTWARE.
     23  */
     24 
     25 #ifdef USE_X86_64_ASM
     26 
     27 #include "matypes.h"
     28 
     29 .text
     30 
     31 .align 16
     32 .globl _mesa_x86_64_cpuid
     33 .hidden _mesa_x86_64_cpuid
     34 _mesa_x86_64_cpuid:
     35 	pushq	%rbx
     36 	movl	(%rdi), %eax
     37 	movl	8(%rdi), %ecx
     38 
     39 	cpuid
     40 
     41 	movl	%ebx, 4(%rdi)
     42 	movl	%eax, (%rdi)
     43 	movl	%ecx, 8(%rdi)
     44 	movl	%edx, 12(%rdi)
     45 	popq	%rbx
     46 	ret
     47 
     48 .align 16
     49 .globl _mesa_x86_64_transform_points4_general
     50 .hidden _mesa_x86_64_transform_points4_general
     51 _mesa_x86_64_transform_points4_general:
     52 /*
     53  *	rdi = dest
     54  *	rsi = matrix
     55  *	rdx = source
     56  */
     57 	movl V4F_COUNT(%rdx), %ecx	/* count */
     58 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
     59 
     60 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
     61 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
     62 	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 3 */
     63 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
     64 
     65 	testl %ecx, %ecx		/* verify non-zero count */
     66 	prefetchnta 64(%rsi)
     67 	jz p4_general_done
     68 
     69 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
     70 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
     71 
     72 	prefetcht1 16(%rdx)
     73 
     74 	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
     75 	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
     76 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
     77 	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
     78         movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
     79 
     80 p4_general_loop:
     81 
     82 	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
     83 	prefetcht1 16(%rdi)
     84 
     85 	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
     86 	addq %rax, %rdx
     87 	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
     88 	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
     89 	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | ox */
     90 	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
     91 	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
     92 	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
     93 	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
     94 	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
     95 	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
     96 	prefetcht1 16(%rdx)
     97 	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
     98 
     99 	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
    100 	addq $16, %rdi
    101 
    102 	decl %ecx
    103 	jnz p4_general_loop
    104 
    105 p4_general_done:
    106 	.byte 0xf3
    107 	ret
    108 
    109 .section .rodata
    110 
    111 .align 16
    112 p4_constants:
    113 .byte  0xff, 0xff, 0xff, 0xff
    114 .byte  0xff, 0xff, 0xff, 0xff
    115 .byte  0xff, 0xff, 0xff, 0xff
    116 .byte  0x00, 0x00, 0x00, 0x00
    117 
    118 .byte  0x00, 0x00, 0x00, 0x00
    119 .byte  0x00, 0x00, 0x00, 0x00
    120 .byte  0x00, 0x00, 0x00, 0x00
    121 .float 1.0
    122 
    123 .text
    124 .align 16
    125 .globl _mesa_x86_64_transform_points4_3d
    126 .hidden _mesa_x86_64_transform_points4_3d
    127 /*
    128  * this is slower than _mesa_x86_64_transform_points4_general
    129  * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
    130  */
    131 _mesa_x86_64_transform_points4_3d:
    132 
    133 	leaq p4_constants(%rip), %rax
    134 
    135 	prefetchnta 64(%rsi)
    136 
    137 	movaps (%rax), %xmm9
    138 	movaps 16(%rax), %xmm10
    139 
    140 	movl V4F_COUNT(%rdx), %ecx	/* count */
    141 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
    142 
    143 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
    144 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
    145 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
    146 
    147 	testl %ecx, %ecx		/* verify non-zero count */
    148 	jz p4_3d_done
    149 
    150 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
    151 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
    152 
    153 	prefetcht1 16(%rdx)
    154 
    155 	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
    156 	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
    157 	andps  %xmm9, %xmm4             /* 0.0 | m2  | m1  | m0  */
    158 	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
    159 	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
    160         movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
    161 	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
    162 	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12  */
    163 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
    164 	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12  */
    165 
    166 p4_3d_loop:
    167 
    168 	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
    169 	prefetcht1 16(%rdi)
    170 
    171 	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
    172 	addq %rax, %rdx
    173 	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
    174 	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
    175 	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | ox */
    176 	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
    177 	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
    178 	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
    179 	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
    180 	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
    181 	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
    182 	prefetcht1 16(%rdx)
    183 	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
    184 
    185 	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
    186 	addq $16, %rdi
    187 
    188 	dec %ecx
    189 	jnz p4_3d_loop
    190 
    191 p4_3d_done:
    192 	.byte 0xf3
    193 	ret
    194 
    195 
    196 .align 16
    197 .globl _mesa_x86_64_transform_points4_identity
    198 .hidden _mesa_x86_64_transform_points4_identity
    199 _mesa_x86_64_transform_points4_identity:
    200 
    201 	movl V4F_COUNT(%rdx), %ecx	/* count */
    202 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
    203 
    204 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
    205 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
    206 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
    207 
    208 	test %ecx, %ecx
    209 	jz p4_identity_done
    210 
    211 	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
    212 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
    213 	prefetcht1 64(%rsi)
    214 	prefetcht1 64(%rdi)
    215 
    216 	add %ecx, %ecx
    217 
    218 	rep movsq
    219 
    220 p4_identity_done:
    221 	.byte 0xf3
    222 	ret
    223 
    224 
    225 .align 16
    226 .globl _mesa_3dnow_transform_points4_3d_no_rot
    227 .hidden _mesa_3dnow_transform_points4_3d_no_rot
    228 _mesa_3dnow_transform_points4_3d_no_rot:
    229 
    230 	movl V4F_COUNT(%rdx), %ecx	/* count */
    231 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
    232 
    233 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
    234 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
    235 	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
    236 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
    237 
    238 	test %ecx, %ecx
    239 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
    240 	jz p4_3d_no_rot_done
    241 
    242 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
    243 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
    244 
    245 	prefetcht1 (%rdx)
    246 
    247 	movd (%rsi), %mm0		/*                 | m00             */
    248 	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
    249 	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
    250 
    251 	movd 40(%rsi), %mm2		/*                 | m22             */
    252 	movq 48(%rsi), %mm1		/* m31             | m30             */
    253 
    254 	punpckldq 56(%rsi), %mm2	/* m11             | m00             */
    255 
    256 p4_3d_no_rot_loop:
    257 
    258 	prefetcht1 32(%rdi)
    259 
    260 	movq  (%rdx), %mm4		/* x1              | x0              */
    261 	movq  8(%rdx), %mm5		/* x3              | x2              */
    262 	movd  12(%rdx), %mm7		/*                 | x3              */
    263 
    264 	movq  %mm5, %mm6		/* x3              | x2              */
    265 	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
    266 
    267 	punpckhdq %mm6, %mm6		/* x3              | x3              */
    268 	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */
    269 
    270 	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
    271 	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */
    272 
    273         pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
    274 
    275 	addq %rax, %rdx
    276 	movq %mm4, (%rdi)		/* write r0, r1                      */
    277 	movq %mm5, 8(%rdi)		/* write r2, r3                      */
    278 
    279 	addq $16, %rdi
    280 
    281 	decl %ecx
    282 	prefetcht1 32(%rdx)
    283 	jnz p4_3d_no_rot_loop
    284 
    285 p4_3d_no_rot_done:
    286 	femms
    287 	ret
    288 
    289 
    290 .align 16
    291 .globl _mesa_3dnow_transform_points4_perspective
    292 .hidden _mesa_3dnow_transform_points4_perspective
    293 _mesa_3dnow_transform_points4_perspective:
    294 
    295 	movl V4F_COUNT(%rdx), %ecx	/* count */
    296 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
    297 
    298 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
    299 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
    300 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
    301 
    302 	test %ecx, %ecx
    303 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
    304 	jz p4_perspective_done
    305 
    306 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
    307 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
    308 
    309 	movd (%rsi), %mm0		/*                 | m00             */
    310         pxor %mm7, %mm7			/* 0               | 0               */
    311 	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
    312 
    313 	movq 32(%rsi), %mm2		/* m21             | m20             */
    314 	prefetcht1 (%rdx)
    315 
    316 	movd 40(%rsi), %mm1		/*                 | m22             */
    317 
    318 	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
    319 	punpckldq 56(%rsi), %mm1	/* m32             | m22             */
    320 
    321 
    322 p4_perspective_loop:
    323 
    324 	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */
    325 
    326 	movq (%rdx), %mm4		/* x1              | x0              */
    327 	movq 8(%rdx), %mm5		/* x3              | x2              */
    328 	movd 8(%rdx), %mm3		/*                 | x2              */
    329 
    330 	movq %mm5, %mm6			/* x3              | x2              */
    331 	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
    332 
    333 	punpckldq %mm5, %mm5		/* x2              | x2              */
    334 
    335 	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
    336 	pfsubr %mm7, %mm3		/*                 | -x2             */
    337 
    338 	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
    339 	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
    340 
    341 	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */
    342 
    343 	movq %mm5, (%rdi)		/* write r0, r1                      */
    344 	addq %rax, %rdx
    345 	movq %mm6, 8(%rdi)		/* write r2, r3                      */
    346 
    347 	addq $16, %rdi
    348 
    349 	decl %ecx
    350 	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
    351 	jnz p4_perspective_loop
    352 
    353 p4_perspective_done:
    354 	femms
    355 	ret
    356 
    357 .align 16
    358 .globl _mesa_3dnow_transform_points4_2d_no_rot
    359 .hidden _mesa_3dnow_transform_points4_2d_no_rot
    360 _mesa_3dnow_transform_points4_2d_no_rot:
    361 
    362 	movl V4F_COUNT(%rdx), %ecx	/* count */
    363 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
    364 
    365 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
    366 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
    367 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
    368 
    369 	test %ecx, %ecx
    370 	.byte 0x90			/* manual align += 1 */
    371 	jz p4_2d_no_rot_done
    372 
    373 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
    374 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
    375 
    376 	movd (%rsi), %mm0		/*                 | m00             */
    377 	prefetcht1 (%rdx)
    378 	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
    379 
    380 	movq 48(%rsi), %mm1		/* m31             | m30             */
    381 
    382 p4_2d_no_rot_loop:
    383 
    384 	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */
    385 
    386 	movq (%rdx), %mm4		/* x1              | x0              */
    387 	movq 8(%rdx), %mm5		/* x3              | x2              */
    388 
    389 	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
    390 	movq %mm5, %mm6			/* x3              | x2              */
    391 
    392 	punpckhdq %mm6, %mm6		/* x3              | x3              */
    393 
    394 	addq %rax, %rdx
    395 	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
    396 
    397 	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
    398 	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
    399 
    400 	movq %mm6, (%rdi)		/* write r0, r1                      */
    401 	movq %mm5, 8(%rdi)		/* write r2, r3                      */
    402 
    403 	addq $16, %rdi
    404 
    405 	decl %ecx
    406 	jnz p4_2d_no_rot_loop
    407 
    408 p4_2d_no_rot_done:
    409 	femms
    410 	ret
    411 
    412 
    413 .align 16
    414 .globl _mesa_3dnow_transform_points4_2d
    415 .hidden _mesa_3dnow_transform_points4_2d
    416 _mesa_3dnow_transform_points4_2d:
    417 
    418 	movl V4F_COUNT(%rdx), %ecx	/* count */
    419 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
    420 
    421 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
    422 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
    423 	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
    424 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
    425 
    426 	test %ecx, %ecx
    427 	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
    428 	jz p4_2d_done
    429 
    430 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
    431 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
    432 
    433 	movd (%rsi), %mm0		/*                 | m00             */
    434 	movd 4(%rsi), %mm1		/*                 | m01             */
    435 
    436 	prefetcht1 (%rdx)
    437 
    438 	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
    439 	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
    440 	punpckldq 20(%rsi), %mm1	/* m11             | m01             */
    441 
    442 	movq 48(%rsi), %mm2		/* m31             | m30             */
    443 
    444 p4_2d_loop:
    445 
    446 	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */
    447 
    448 	movq (%rdx), %mm3		/* x1              | x0              */
    449 	movq 8(%rdx), %mm5		/* x3              | x2              */
    450 
    451 	movq %mm3, %mm4			/* x1              | x0              */
    452 	movq %mm5, %mm6			/* x3              | x2              */
    453 
    454 	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
    455 	punpckhdq %mm6, %mm6		/* x3              | x3              */
    456 
    457 	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */
    458 
    459 	addq %rax, %rdx
    460 	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
    461 
    462 	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
    463 	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
    464 
    465 	pfadd %mm6, %mm3		/* r1              | r0              */
    466 
    467 	movq %mm3, (%rdi)		/* write r0, r1                      */
    468 	movq %mm5, 8(%rdi)		/* write r2, r3                      */
    469 
    470 	addq $16, %rdi
    471 
    472 	decl %ecx
    473 	jnz p4_2d_loop
    474 
    475 p4_2d_done:
    476 	femms
    477 	ret
    478 
    479 #endif
    480 
    481 #if defined (__ELF__) && defined (__linux__)
    482 	.section .note.GNU-stack,"",%progbits
    483 #endif
    484