Home | History | Annotate | Download | only in x86
      1 
      2 /*
      3  * Mesa 3-D graphics library
      4  *
      5  * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
      6  *
      7  * Permission is hereby granted, free of charge, to any person obtaining a
      8  * copy of this software and associated documentation files (the "Software"),
      9  * to deal in the Software without restriction, including without limitation
     10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
     11  * and/or sell copies of the Software, and to permit persons to whom the
     12  * Software is furnished to do so, subject to the following conditions:
     13  *
     14  * The above copyright notice and this permission notice shall be included
     15  * in all copies or substantial portions of the Software.
     16  *
     17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
     18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     20  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
     21  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
     22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
     23  * OTHER DEALINGS IN THE SOFTWARE.
     24  */
     25 
/** TODO:
  * - insert PREFETCH instructions to avoid cache misses
  * - some more optimizations are possible
  * - for 40-50% more performance in the SSE functions, the data
  *   (trans-matrix, src_vert, dst_vert) needs to be 16-byte aligned
  */
     32 
     33 #ifdef USE_SSE_ASM
     34 #include "assyntax.h"
     35 #include "matypes.h"
     36 #include "xform_args.h"
     37 
     38    SEG_TEXT
     39 
     40 #define S(i) 	REGOFF(i * 4, ESI)
     41 #define D(i) 	REGOFF(i * 4, EDI)
     42 #define M(i) 	REGOFF(i * 4, EDX)
     43 
     44 
     45 ALIGNTEXT4
     46 GLOBL GLNAME(_mesa_sse_transform_points3_general)
     47 HIDDEN(_mesa_sse_transform_points3_general)
     48 GLNAME( _mesa_sse_transform_points3_general ):
     49 
     50 #define FRAME_OFFSET 8
     51     PUSH_L    ( ESI )
     52     PUSH_L    ( EDI )
     53 
     54     MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
     55     MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
     56 
     57     MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
     58     MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
     59 
     60     CMP_L     ( CONST(0), ECX )			/* count == 0 ? */
     61     JE        ( LLBL(K_GTPGR_finish) )		/* yes -> nothing to do. */
     62 
     63     MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
     64     OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
     65 
     66     MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
     67     MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
     68 
     69     SHL_L( CONST(4), ECX ) 			/* count *= 16 */
     70     MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
     71 
     72     MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
     73     ADD_L( EDI, ECX ) 				/* count += dest ptr */
     74 
     75 
     76 ALIGNTEXT32
     77     MOVAPS    ( REGOFF(0, EDX), XMM0 )	/* m0  | m1  | m2  | m3 */
     78     MOVAPS    ( REGOFF(16, EDX), XMM1 )	/* m4  | m5  | m6  | m7 */
     79     MOVAPS    ( REGOFF(32, EDX), XMM2 )	/* m8  | m9  | m10 | m11 */
     80     MOVAPS    ( REGOFF(48, EDX), XMM3 )	/* m12 | m13 | m14 | m15 */
     81 
     82 
     83 ALIGNTEXT32
     84 LLBL(K_GTPGR_top):
     85     MOVSS     ( REGOFF(0, ESI), XMM4 )		/*    |    |    | ox */
     86     SHUFPS    ( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox | ox */
     87     MOVSS     ( REGOFF(4, ESI), XMM5 )		/*    |    |    | oy */
     88     SHUFPS    ( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy | oy */
     89     MOVSS     ( REGOFF(8, ESI), XMM6 )		/*    |    |    | oz */
     90     SHUFPS    ( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz | oz */
     91 
     92     MULPS     ( XMM0, XMM4 )		/* m3*ox  | m2*ox  | m1*ox | m0*ox */
     93     MULPS     ( XMM1, XMM5 )		/* m7*oy  | m6*oy  | m5*oy | m4*oy */
     94     MULPS     ( XMM2, XMM6 )		/* m11*oz | m10*oz | m9*oz | m8*oz */
     95 
     96     ADDPS     ( XMM5, XMM4 )
     97     ADDPS     ( XMM6, XMM4 )
     98     ADDPS     ( XMM3, XMM4 )
     99 
    100     MOVAPS    ( XMM4, REGOFF(0, EDI) )
    101 
    102 LLBL(K_GTPGR_skip):
    103     ADD_L     ( CONST(16), EDI )
    104     ADD_L     ( EAX, ESI )
    105     CMP_L     ( ECX, EDI )
    106     JNE       ( LLBL(K_GTPGR_top) )
    107 
    108 LLBL(K_GTPGR_finish):
    109     POP_L     ( EDI )
    110     POP_L     ( ESI )
    111     RET
    112 #undef FRAME_OFFSET
    113 
    114 
    115 ALIGNTEXT4
    116 GLOBL GLNAME(_mesa_sse_transform_points3_identity)
    117 HIDDEN(_mesa_sse_transform_points3_identity)
    118 GLNAME( _mesa_sse_transform_points3_identity ):
    119 
    120 #define FRAME_OFFSET 8
    121     PUSH_L    ( ESI )
    122     PUSH_L    ( EDI )
    123 
    124     MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
    125     MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
    126 
    127     MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
    128 
    129     TEST_L( ECX, ECX)
    130     JZ( LLBL(K_GTPIR_finish) ) 			/* count was zero; go to finish */
    131 
    132     MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    133     OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
    134 
    135     MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    136     MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
    137 
    138     SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    139     MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
    140 
    141     MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    142     ADD_L( EDI, ECX ) 				/* count += dest ptr */
    143 
    144     CMP_L( ESI, EDI )
    145     JE( LLBL(K_GTPIR_finish) )
    146 
    147 
    148 ALIGNTEXT32
    149 LLBL(K_GTPIR_top):
    150     MOVLPS    ( S(0), XMM0 )
    151     MOVLPS    ( XMM0, D(0) )
    152     MOVSS     ( S(2), XMM0 )
    153     MOVSS     ( XMM0, D(2) )
    154 
    155 LLBL(K_GTPIR_skip):
    156     ADD_L     ( CONST(16), EDI )
    157     ADD_L     ( EAX, ESI )
    158     CMP_L     ( ECX, EDI )
    159     JNE       ( LLBL(K_GTPIR_top) )
    160 
    161 LLBL(K_GTPIR_finish):
    162     POP_L     ( EDI )
    163     POP_L     ( ESI )
    164     RET
    165 #undef FRAME_OFFSET
    166 
    167 
    168 
    169 
    170 ALIGNTEXT4
    171 GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
    172 HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
    173 GLNAME(_mesa_sse_transform_points3_3d_no_rot):
    174 
    175 #define FRAME_OFFSET 8
    176     PUSH_L( ESI )
    177     PUSH_L( EDI )
    178 
    179     MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
    180     MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
    181 
    182 
    183     MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
    184     MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
    185 
    186     TEST_L( ECX, ECX)
    187     JZ( LLBL(K_GTP3DNRR_finish) ) 		/* count was zero; go to finish */
    188 
    189     MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    190     OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
    191 
    192     MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    193     MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
    194 
    195     SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    196     MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
    197 
    198     MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    199     ADD_L( EDI, ECX ) 				/* count += dest ptr */
    200 
    201     XORPS( XMM0, XMM0 )                         /* clean the working register */
    202 
    203 ALIGNTEXT32
    204     MOVSS    ( M(0), XMM1 )			/* - | - |  -  | m0  */
    205     MOVSS    ( M(5), XMM2 )			/* - | - |  -  | m5  */
    206     UNPCKLPS ( XMM2, XMM1 )			/* - | - | m5  | m0  */
    207     MOVLPS   ( M(12), XMM2 )			/* - | - | m13 | m12 */
    208     MOVSS    ( M(10), XMM3 )			/* - | - |  -  | m10 */
    209     MOVSS    ( M(14), XMM4 )			/* - | - |  -  | m14 */
    210 
    211 ALIGNTEXT32
    212 LLBL(K_GTP3DNRR_top):
    213 
    214     MOVLPS   ( S(0), XMM0 )			/* - | - |  s1   | s0 */
    215     MULPS    ( XMM1, XMM0 )			/* - | - | s1*m5 | s0*m0 */
    216     ADDPS    ( XMM2, XMM0 )			/* - | - | +m13  | +m12 */
    217     MOVLPS   ( XMM0, D(0) )			/* -> D(1) | -> D(0) */
    218 
    219     MOVSS    ( S(2), XMM0 )			/* sz */
    220     MULSS    ( XMM3, XMM0 )			/* sz*m10 */
    221     ADDSS    ( XMM4, XMM0 )			/* +m14 */
    222     MOVSS    ( XMM0, D(2) )			/* -> D(2) */
    223 
    224 LLBL(K_GTP3DNRR_skip):
    225     ADD_L    ( CONST(16), EDI )
    226     ADD_L    ( EAX, ESI )
    227     CMP_L    ( ECX, EDI )
    228     JNE      ( LLBL(K_GTP3DNRR_top) )
    229 
    230 LLBL(K_GTP3DNRR_finish):
    231     POP_L    ( EDI )
    232     POP_L    ( ESI )
    233     RET
    234 #undef FRAME_OFFSET
    235 
    236 
    237 
    238 ALIGNTEXT4
    239 GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
    240 HIDDEN(_mesa_sse_transform_points3_perspective)
    241 GLNAME(_mesa_sse_transform_points3_perspective):
    242 
    243 #define FRAME_OFFSET 8
    244     PUSH_L   ( ESI )
    245     PUSH_L   ( EDI )
    246 
    247     MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
    248     MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
    249 
    250     MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
    251     MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
    252 
    253     TEST_L( ECX, ECX)
    254     JZ( LLBL(K_GTP3PR_finish) )			/* count was zero; go to finish */
    255 
    256     MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    257     OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
    258 
    259     MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    260     MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
    261 
    262     SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    263     MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
    264 
    265     MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    266     ADD_L( EDI, ECX ) 				/* count += dest ptr */
    267 
    268 ALIGNTEXT32
    269     MOVSS    ( M(0), XMM1 )			/* -  | -  |  -  | m0  */
    270     MOVSS    ( M(5), XMM2 )			/* -  | -  |  -  | m5  */
    271     UNPCKLPS ( XMM2, XMM1 )			/* -  | -  | m5  | m0  */
    272     MOVLPS   ( M(8), XMM2 )			/* -  | -  | m9  | m8  */
    273     MOVSS    ( M(10), XMM3 )			/* m10 */
    274     MOVSS    ( M(14), XMM4 )			/* m14 */
    275     XORPS    ( XMM6, XMM6 )			/* 0 */
    276 
    277 ALIGNTEXT32
    278 LLBL(K_GTP3PR_top):
    279     MOVLPS   ( S(0), XMM0 )			/* oy | ox */
    280     MULPS    ( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
    281     MOVSS    ( S(2), XMM5 )			/* oz */
    282     SHUFPS   ( CONST(0x0), XMM5, XMM5 )		/* oz | oz */
    283     MULPS    ( XMM2, XMM5 )			/* oz*m9 | oz*m8 */
    284     ADDPS    ( XMM5, XMM0 )			/* +oy*m5 | +ox*m0 */
    285     MOVLPS   ( XMM0, D(0) )			/* ->D(1) | ->D(0) */
    286 
    287     MOVSS    ( S(2), XMM0 )			/* oz */
    288     MULSS    ( XMM3, XMM0 )			/* oz*m10 */
    289     ADDSS    ( XMM4, XMM0 )			/* +m14 */
    290     MOVSS    ( XMM0, D(2) )			/* ->D(2) */
    291 
    292     MOVSS    ( S(2), XMM0 )			/* oz */
    293     MOVSS    ( XMM6, XMM5 )			/* 0 */
    294     SUBPS    ( XMM0, XMM5 )			/* -oz */
    295     MOVSS    ( XMM5, D(3) )			/* ->D(3) */
    296 
    297 LLBL(K_GTP3PR_skip):
    298     ADD_L( CONST(16), EDI )
    299     ADD_L( EAX, ESI )
    300     CMP_L( ECX, EDI )
    301     JNE( LLBL(K_GTP3PR_top) )
    302 
    303 LLBL(K_GTP3PR_finish):
    304     POP_L    ( EDI )
    305     POP_L    ( ESI )
    306     RET
    307 #undef FRAME_OFFSET
    308 
    309 
    310 
    311 ALIGNTEXT4
    312 GLOBL GLNAME(_mesa_sse_transform_points3_2d)
    313 HIDDEN(_mesa_sse_transform_points3_2d)
    314 GLNAME(_mesa_sse_transform_points3_2d):
    315 
    316 #define FRAME_OFFSET 8
    317     PUSH_L( ESI )
    318     PUSH_L( EDI )
    319 
    320     MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
    321     MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
    322 
    323     MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
    324     MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
    325 
    326     TEST_L( ECX, ECX)
    327     JZ( LLBL(K_GTP3P2DR_finish) ) 		/* count was zero; go to finish */
    328 
    329     MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    330     OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
    331 
    332     MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    333     MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
    334 
    335     SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    336     MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
    337 
    338     MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    339     ADD_L( EDI, ECX ) 				/* count += dest ptr */
    340 
    341 ALIGNTEXT32
    342     MOVLPS( M(0), XMM0 )			/* m1  | m0 */
    343     MOVLPS( M(4), XMM1 )			/* m5  | m4 */
    344     MOVLPS( M(12), XMM2 )			/* m13 | m12 */
    345 
    346 ALIGNTEXT32
    347 LLBL(K_GTP3P2DR_top):
    348     MOVSS    ( S(0), XMM3 )			/* ox */
    349     SHUFPS   ( CONST(0x0), XMM3, XMM3 )		/* ox | ox */
    350     MULPS    ( XMM0, XMM3 )			/* ox*m1 | ox*m0 */
    351     MOVSS    ( S(1), XMM4 )			/* oy */
    352     SHUFPS   ( CONST(0x0), XMM4, XMM4 )		/* oy | oy */
    353     MULPS    ( XMM1, XMM4 )			/* oy*m5 | oy*m4 */
    354 
    355     ADDPS    ( XMM4, XMM3 )
    356     ADDPS    ( XMM2, XMM3 )
    357     MOVLPS   ( XMM3, D(0) )
    358 
    359     MOVSS    ( S(2), XMM3 )
    360     MOVSS    ( XMM3, D(2) )
    361 
    362 LLBL(K_GTP3P2DR_skip):
    363     ADD_L    ( CONST(16), EDI )
    364     ADD_L    ( EAX, ESI )
    365     CMP_L    ( ECX, EDI )
    366     JNE      ( LLBL(K_GTP3P2DR_top) )
    367 
    368 LLBL(K_GTP3P2DR_finish):
    369     POP_L    ( EDI )
    370     POP_L    ( ESI )
    371     RET
    372 #undef FRAME_OFFSET
    373 
    374 
    375 
    376 ALIGNTEXT4
    377 GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
    378 HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
    379 GLNAME(_mesa_sse_transform_points3_2d_no_rot):
    380 
    381 #define FRAME_OFFSET 8
    382 	PUSH_L( ESI )
    383 	PUSH_L( EDI )
    384 
    385 	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
    386 	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
    387 
    388 	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
    389 	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
    390 
    391 	TEST_L( ECX, ECX)
    392 	JZ( LLBL(K_GTP3P2DNRR_finish) ) 	/* count was zero; go to finish */
    393 
    394 	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    395 	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
    396 
    397 	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    398 	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
    399 
    400 	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    401 	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
    402 
    403 	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    404 	ADD_L( EDI, ECX ) 			/* count += dest ptr */
    405 
    406 ALIGNTEXT32
    407 	MOVSS    ( M(0), XMM1 )			/* m0 */
    408 	MOVSS    ( M(5), XMM2 )			/* m5 */
    409 	UNPCKLPS ( XMM2, XMM1 )			/* m5 | m0 */
    410 	MOVLPS   ( M(12), XMM2 )		/* m13 | m12 */
    411 
    412 ALIGNTEXT32
    413 LLBL(K_GTP3P2DNRR_top):
    414 	MOVLPS( S(0), XMM0 )			/* oy | ox */
    415 	MULPS( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
    416 	ADDPS( XMM2, XMM0 )			/* +m13 | +m12 */
    417 	MOVLPS( XMM0, D(0) )			/* ->D(1) | ->D(0) */
    418 
    419 	MOVSS( S(2), XMM0 )
    420 	MOVSS( XMM0, D(2) )
    421 
    422 LLBL(K_GTP3P2DNRR_skip):
    423 	ADD_L( CONST(16), EDI )
    424 	ADD_L( EAX, ESI )
    425 	CMP_L( ECX, EDI )
    426 	JNE( LLBL(K_GTP3P2DNRR_top) )
    427 
    428 LLBL(K_GTP3P2DNRR_finish):
    429 	POP_L( EDI )
    430 	POP_L( ESI )
    431 	RET
    432 #undef FRAME_OFFSET
    433 
    434 
    435 
    436 
    437 ALIGNTEXT4
    438 GLOBL GLNAME(_mesa_sse_transform_points3_3d)
    439 HIDDEN(_mesa_sse_transform_points3_3d)
    440 GLNAME(_mesa_sse_transform_points3_3d):
    441 
    442 #define FRAME_OFFSET 8
    443 	PUSH_L( ESI )
    444 	PUSH_L( EDI )
    445 
    446 	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
    447 	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
    448 
    449 
    450 	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
    451 	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
    452 
    453 	TEST_L( ECX, ECX)
    454 	JZ( LLBL(K_GTP3P3DR_finish) ) 	/* count was zero; go to finish */
    455 
    456 	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* stride */
    457 	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
    458 
    459 	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
    460 	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
    461 
    462 	SHL_L( CONST(4), ECX ) 			/* count *= 16 */
    463 	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
    464 
    465 	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
    466 	ADD_L( EDI, ECX ) 			/* count += dest ptr */
    467 
    468 
    469 ALIGNTEXT32
    470 	MOVAPS( M(0), XMM0 )			/* m2  | m1  | m0 */
    471 	MOVAPS( M(4), XMM1 )			/* m6  | m5  | m4 */
    472 	MOVAPS( M(8), XMM2 )			/* m10 | m9  | m8 */
    473 	MOVAPS( M(12), XMM3 )			/* m14 | m13 | m12 */
    474 
    475 ALIGNTEXT32
    476 LLBL(K_GTP3P3DR_top):
    477 	MOVSS( S(0), XMM4 )
    478 	SHUFPS( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox */
    479 	MULPS( XMM0, XMM4 )			/* ox*m2 | ox*m1 | ox*m0 */
    480 
    481 	MOVSS( S(1), XMM5 )
    482 	SHUFPS( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy */
    483 	MULPS( XMM1, XMM5 )			/* oy*m6 | oy*m5 | oy*m4 */
    484 
    485 	MOVSS( S(2), XMM6 )
    486 	SHUFPS( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz */
    487 	MULPS( XMM2, XMM6 )			/* oz*m10 | oz*m9 | oz*m8 */
    488 
    489 	ADDPS( XMM5, XMM4 )			/* + | + | + */
    490 	ADDPS( XMM6, XMM4 )			/* + | + | + */
    491 	ADDPS( XMM3, XMM4 )			/* + | + | + */
    492 
    493 	MOVLPS( XMM4, D(0) )			/* => D(1) | => D(0) */
    494 	UNPCKHPS( XMM4, XMM4 )
    495 	MOVSS( XMM4, D(2) )
    496 
    497 LLBL(K_GTP3P3DR_skip):
    498 	ADD_L( CONST(16), EDI )
    499 	ADD_L( EAX, ESI )
    500 	CMP_L( ECX, EDI )
    501 	JNE( LLBL(K_GTP3P3DR_top) )
    502 
    503 LLBL(K_GTP3P3DR_finish):
    504 	POP_L( EDI )
    505 	POP_L( ESI )
    506 	RET
    507 #undef FRAME_OFFSET
    508 #endif
    509 
    510 #if defined (__ELF__) && defined (__linux__)
    511 	.section .note.GNU-stack,"",%progbits
    512 #endif
    513