/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Syntax:  GNU as, AT&T operand order, run through the C preprocessor.
 *
 * Register convention used by every transform routine in this file:
 *	rdi = dest vector descriptor
 *	rsi = matrix (16 floats; loaded with movaps by the SSE routines)
 *	rdx = source vector descriptor
 * The V4F_* field offsets and VEC_SIZE_4 come from matypes.h.
 *
 * NOTE(review): the stride field is read with movzbl, i.e. only its low
 * byte is used.  If the field is wider than one byte, strides above 255
 * are truncated -- confirm against the structure layout in matypes.h.
 *
 * ".byte 0xf3 ; ret" encodes "rep ret"; the ".byte 0x66, ..., 0x90"
 * sequences are prefixed nops used as hand-placed padding.
 */

#ifdef USE_X86_64_ASM

#include "matypes.h"

.text

/*
 * _mesa_x86_64_cpuid
 *
 * In:	rdi = pointer to four consecutive 32-bit words
 *		word 0 supplies EAX, word 2 supplies ECX for CPUID
 * Out:	words 0..3 receive EAX, EBX, ECX, EDX as returned by CPUID
 * rbx is saved and restored around the instruction because CPUID
 * overwrites it.
 */
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx
	movl	(%rdi), %eax
	movl	8(%rdi), %ecx

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret

/*
 * _mesa_x86_64_transform_points4_general
 *
 * Full 4x4 transform of every source vertex:
 *	dst = ox*M[0..3] + oy*M[4..7] + oz*M[8..11] + ow*M[12..15]
 * Source vertices are read unaligned and advanced by the source stride;
 * destination vertices are written with movaps, 16 bytes apart.
 * Uses xmm0-xmm8, rax, rcx.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
/*
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 */
	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)
	jz p4_general_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance source by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	decl %ecx
	jnz p4_general_loop

p4_general_done:
	.byte 0xf3
	ret

/*
 * Constants for _mesa_x86_64_transform_points4_3d:
 *	bytes  0..15: AND mask that keeps the three low dwords of a
 *		      register and clears the highest one
 *	bytes 16..31: 0.0 | 0.0 | 0.0 in the low dwords, 1.0 in the highest
 */
.section .rodata

.align 16
p4_constants:
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0x00, 0x00, 0x00, 0x00

.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.float 1.0

/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same arithmetic as _mesa_x86_64_transform_points4_general, but the
 * matrix registers are first patched with p4_constants so that
 * m3 = m7 = m11 = 0.0 and m15 = 1.0, whatever the matrix in memory holds.
 * The original author notes this makes it slower than the general routine.
 * Uses xmm0-xmm10, rax, rcx.
 */
.text
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
/*
 * this is slower than _mesa_x86_64_transform_points4_general
 * because it ensures that the last matrix row (or is it column?) is 0,0,0,1
 */
_mesa_x86_64_transform_points4_3d:

	leaq p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps (%rax), %xmm9		/* mask: clear highest dword */
	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_3d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps  %xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups (%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq %rax, %rdx			/* advance source by stride */
	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq $16, %rdi

	decl %ecx
	jnz p4_3d_loop

p4_3d_done:
	.byte 0xf3
	ret


/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Copies every 16-byte source vertex to the destination unchanged; the
 * matrix argument is not read.
 *
 * A source whose stride is exactly 16 bytes (4 floats, tightly packed) is
 * copied in one "rep movsq".  Any other stride is copied one vertex at a
 * time so that the source pointer advances by the stride while the
 * destination advances by 16.  Previously the stride was loaded but
 * ignored, so non-packed sources were copied as if they were packed.
 * Uses rax, rcx, rsi, rdi, xmm0.
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_identity_done

	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	cmpl $16, %eax			/* tightly packed source? */
	jne p4_identity_strided

	addl %ecx, %ecx			/* two quadwords per vertex */

	rep movsq
	ret

p4_identity_strided:

	movups (%rsi), %xmm0		/* ow | oz | oy | ox */
	addq %rax, %rsi			/* advance source by stride */
	movups %xmm0, (%rdi)		/* unaligned store: rep movsq had
					   no alignment requirement either */
	addq $16, %rdi

	decl %ecx
	jnz p4_identity_strided

p4_identity_done:
	.byte 0xf3
	ret


/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * 3DNow! transform using only the matrix floats at byte offsets
 * 0 (m00), 20 (m11), 40 (m22), 48/52 (m30/m31) and 56 (m32):
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 * Uses mm0-mm7, rax, rcx; leaves MMX state with femms.
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_3d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 (%rdx)

	movd (%rsi), %mm0		/*                 | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movd 40(%rsi), %mm2		/*                 | m22             */
	movq 48(%rsi), %mm1		/* m31             | m30             */

	punpckldq 56(%rsi), %mm2	/* m32             | m22             */

p4_3d_no_rot_loop:

	prefetcht1 32(%rdi)

	movq  (%rdx), %mm4		/* x1              | x0              */
	movq  8(%rdx), %mm5		/* x3              | x2              */
	movd  12(%rdx), %mm7		/*                 | x3              */

	movq  %mm5, %mm6		/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckhdq %mm6, %mm6		/* x3              | x3              */
	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */

	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
	pfacc %mm7, %mm5		/* x3              | x2*m22+x3*m32   */

	pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	addq %rax, %rdx			/* advance source by stride */
	movq %mm4, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetcht1 32(%rdx)
	jnz p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * 3DNow! transform using only the matrix floats at byte offsets
 * 0 (m00), 20 (m11), 32/36 (m20/m21), 40 (m22) and 56 (m32):
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 * Uses mm0-mm7, rax, rcx; leaves MMX state with femms.
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_perspective_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	pxor %mm7, %mm7			/* 0               | 0               */
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 32(%rsi), %mm2		/* m21             | m20             */
	prefetcht1 (%rdx)

	movd 40(%rsi), %mm1		/*                 | m22             */

	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32             | m22             */


p4_perspective_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */
	movd 8(%rdx), %mm3		/*                 | x2              */

	movq %mm5, %mm6			/* x3              | x2              */
	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */

	punpckldq %mm5, %mm5		/* x2              | x2              */

	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
	pfsubr %mm7, %mm3		/*                 | -x2             */

	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */

	pfacc %mm3, %mm6		/* -x2             | x2*m22+x3*m32   */

	movq %mm5, (%rdi)		/* write r0, r1                      */
	addq %rax, %rdx			/* advance source by stride */
	movq %mm6, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
	jnz p4_perspective_loop

p4_perspective_done:
	femms
	ret

/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * 3DNow! transform using only the matrix floats at byte offsets
 * 0 (m00), 20 (m11) and 48/52 (m30/m31):
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Uses mm0, mm1, mm4-mm6, rax, rcx; leaves MMX state with femms.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x90			/* manual align += 1 */
	jz p4_2d_no_rot_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	prefetcht1 (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11             | m00             */

	movq 48(%rsi), %mm1		/* m31             | m30             */

p4_2d_no_rot_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm4		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
	movq %mm5, %mm6			/* x3              | x2              */

	punpckhdq %mm6, %mm6		/* x3              | x3              */

	addq %rax, %rdx			/* advance source by stride */
	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */

	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */

	movq %mm6, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms
	ret


/*
 * _mesa_3dnow_transform_points4_2d
 *
 * 3DNow! transform using only the matrix floats at byte offsets
 * 0/4 (m00/m01), 16/20 (m10/m11) and 48/52 (m30/m31):
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 * Uses mm0-mm6, rax, rcx; leaves MMX state with femms.
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz p4_2d_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd (%rsi), %mm0		/*                 | m00             */
	movd 4(%rsi), %mm1		/*                 | m01             */

	prefetcht1 (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11             | m01             */

	movq 48(%rsi), %mm2		/* m31             | m30             */

p4_2d_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */

	movq (%rdx), %mm3		/* x1              | x0              */
	movq 8(%rdx), %mm5		/* x3              | x2              */

	movq %mm3, %mm4			/* x1              | x0              */
	movq %mm5, %mm6			/* x3              | x2              */

	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
	punpckhdq %mm6, %mm6		/* x3              | x3              */

	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */

	addq %rax, %rdx			/* advance source by stride */
	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */

	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
	prefetcht1 32(%rdx)		/* hopefully stride is zero          */

	pfadd %mm6, %mm3		/* r1              | r0              */

	movq %mm3, (%rdi)		/* write r0, r1                      */
	movq %mm5, 8(%rdi)		/* write r2, r3                      */

	addq $16, %rdi

	decl %ecx
	jnz p4_2d_loop

p4_2d_done:
	femms
	ret

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif