// (removed code-browser navigation header: Home | History | Annotate | Download | only in WinQuake)
      1 /*
      2 Copyright (C) 1996-1997 Id Software, Inc.
      3 
      4 This program is free software; you can redistribute it and/or
      5 modify it under the terms of the GNU General Public License
      6 as published by the Free Software Foundation; either version 2
      7 of the License, or (at your option) any later version.
      8 
      9 This program is distributed in the hope that it will be useful,
     10 but WITHOUT ANY WARRANTY; without even the implied warranty of
     11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
     12 
     13 See the GNU General Public License for more details.
     14 
     15 You should have received a copy of the GNU General Public License
     16 along with this program; if not, write to the Free Software
     17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
     18 
     19 */
     20 //
     21 // d_draw16.s
     22 // x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
     23 // subdivision.
     24 //
     25 
     26 #include "asm_i386.h"
     27 #include "quakeasm.h"
     28 #include "asm_draw.h"
     29 #include "d_ifacea.h"
     30 
     31 #if	id386
     32 
     33 //----------------------------------------------------------------------
     34 // 8-bpp horizontal span drawing code for polygons, with no transparency and
     35 // 16-pixel subdivision.
     36 //
     37 // Assumes there is at least one span in pspans, and that every span
     38 // contains at least one pixel
     39 //----------------------------------------------------------------------
     40 
     41 	.data
     42 
     43 	.text
     44 
     45 // out-of-line, rarely-needed clamping code
     46 
     47 LClampHigh0:
     48 	movl	C(bbextents),%esi
     49 	jmp		LClampReentry0
     50 LClampHighOrLow0:
     51 	jg		LClampHigh0
     52 	xorl	%esi,%esi
     53 	jmp		LClampReentry0
     54 
     55 LClampHigh1:
     56 	movl	C(bbextentt),%edx
     57 	jmp		LClampReentry1
     58 LClampHighOrLow1:
     59 	jg		LClampHigh1
     60 	xorl	%edx,%edx
     61 	jmp		LClampReentry1
     62 
     63 LClampLow2:
     64 	movl	$4096,%ebp
     65 	jmp		LClampReentry2
     66 LClampHigh2:
     67 	movl	C(bbextents),%ebp
     68 	jmp		LClampReentry2
     69 
     70 LClampLow3:
     71 	movl	$4096,%ecx
     72 	jmp		LClampReentry3
     73 LClampHigh3:
     74 	movl	C(bbextentt),%ecx
     75 	jmp		LClampReentry3
     76 
     77 LClampLow4:
     78 	movl	$4096,%eax
     79 	jmp		LClampReentry4
     80 LClampHigh4:
     81 	movl	C(bbextents),%eax
     82 	jmp		LClampReentry4
     83 
     84 LClampLow5:
     85 	movl	$4096,%ebx
     86 	jmp		LClampReentry5
     87 LClampHigh5:
     88 	movl	C(bbextentt),%ebx
     89 	jmp		LClampReentry5
     90 
     91 
     92 #define pspans	4+16
     93 
     94 	.align 4
     95 .globl C(D_DrawSpans16)
     96 C(D_DrawSpans16):
     97 	pushl	%ebp				// preserve caller's stack frame
     98 	pushl	%edi
     99 	pushl	%esi				// preserve register variables
    100 	pushl	%ebx
    101 
    102 //
    103 // set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
    104 // and span list pointers
    105 //
    106 // TODO: any overlap from rearranging?
    107 	flds	C(d_sdivzstepu)
    108 	fmuls	fp_16
    109 	movl	C(cacheblock),%edx
    110 	flds	C(d_tdivzstepu)
    111 	fmuls	fp_16
    112 	movl	pspans(%esp),%ebx	// point to the first span descriptor
    113 	flds	C(d_zistepu)
    114 	fmuls	fp_16
    115 	movl	%edx,pbase			// pbase = cacheblock
    116 	fstps	zi16stepu
    117 	fstps	tdivz16stepu
    118 	fstps	sdivz16stepu
    119 
    120 LSpanLoop:
    121 //
    122 // set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
    123 // initial s and t values
    124 //
    125 // FIXME: pipeline FILD?
    126 	fildl	espan_t_v(%ebx)
    127 	fildl	espan_t_u(%ebx)
    128 
    129 	fld		%st(1)			// dv | du | dv
    130 	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv
    131 	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv
    132 	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
    133 	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
    134 	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |
    135 							//  dv*d_sdivzstepv | du | dv
    136 	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
    137 							//  dv*d_sdivzstepv | du | dv
    138 	faddp	%st(0),%st(2)	// du*d_tdivzstepu |
    139 							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
    140 	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
    141 							//  du*d_tdivzstepu | du | dv
    142 	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
    143 							//  du*d_tdivzstepu | du | dv
    144 	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |
    145 							//  du*d_sdivzstepu + dv*d_sdivzstepv |
    146 							//  du*d_tdivzstepu | du | dv
    147 	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
    148 							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
    149 	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
    150 							//  du*d_sdivzstepu; stays in %st(2) at end
    151 	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
    152 							//  s/z
    153 	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
    154 							//  du*d_tdivzstepu | du | s/z
    155 	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |
    156 							//  du*d_tdivzstepu | du | s/z
    157 	faddp	%st(0),%st(2)	// dv*d_zistepv |
    158 							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
    159 	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
    160 							//  dv*d_zistepv | s/z
    161 	fmuls	C(d_zistepu)		// du*d_zistepu |
    162 							//  dv*d_tdivzstepv + du*d_tdivzstepu |
    163 							//  dv*d_zistepv | s/z
    164 	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
    165 							//  du*d_zistepu | dv*d_zistepv | s/z
    166 	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
    167 							//  du*d_tdivzstepu; stays in %st(1) at end
    168 	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
    169 	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z
    170 
    171 	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
    172 	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
    173 	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
    174 							//  du*d_zistepu; stays in %st(0) at end
    175 							// 1/z | fp_64k | t/z | s/z
    176 //
    177 // calculate and clamp s & t
    178 //
    179 	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z
    180 
    181 //
    182 // point %edi to the first pixel in the span
    183 //
    184 	movl	C(d_viewbuffer),%ecx
    185 	movl	espan_t_v(%ebx),%eax
    186 	movl	%ebx,pspantemp	// preserve spans pointer
    187 
    188 	movl	C(tadjust),%edx
    189 	movl	C(sadjust),%esi
    190 	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
    191 	addl	%ecx,%edi
    192 	movl	espan_t_u(%ebx),%ecx
    193 	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];
    194 	movl	espan_t_count(%ebx),%ecx
    195 
    196 //
    197 // now start the FDIV for the end of the span
    198 //
    199 	cmpl	$16,%ecx
    200 	ja		LSetupNotLast1
    201 
    202 	decl	%ecx
    203 	jz		LCleanup1		// if only one pixel, no need to start an FDIV
    204 	movl	%ecx,spancountminus1
    205 
    206 // finish up the s and t calcs
    207 	fxch	%st(1)			// z*64k | 1/z | t/z | s/z
    208 
    209 	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
    210 	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
    211 	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
    212 	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
    213 	fxch	%st(1)			// s | t | 1/z | t/z | s/z
    214 	fistpl	s				// 1/z | t | t/z | s/z
    215 	fistpl	t				// 1/z | t/z | s/z
    216 
    217 	fildl	spancountminus1
    218 
    219 	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1
    220 	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
    221 	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
    222 	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
    223 	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
    224 	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
    225 	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
    226 							//  C(d_tdivzstepu)*scm1
    227 	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
    228 							//  C(d_tdivzstepu)*scm1
    229 	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
    230 	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
    231 	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
    232 	faddp	%st(0),%st(3)
    233 
    234 	flds	fp_64k
    235 	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
    236 							//  overlap
    237 	jmp		LFDIVInFlight1
    238 
    239 LCleanup1:
    240 // finish up the s and t calcs
    241 	fxch	%st(1)			// z*64k | 1/z | t/z | s/z
    242 
    243 	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
    244 	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
    245 	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
    246 	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
    247 	fxch	%st(1)			// s | t | 1/z | t/z | s/z
    248 	fistpl	s				// 1/z | t | t/z | s/z
    249 	fistpl	t				// 1/z | t/z | s/z
    250 	jmp		LFDIVInFlight1
    251 
    252 	.align	4
    253 LSetupNotLast1:
    254 // finish up the s and t calcs
    255 	fxch	%st(1)			// z*64k | 1/z | t/z | s/z
    256 
    257 	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
    258 	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
    259 	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
    260 	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
    261 	fxch	%st(1)			// s | t | 1/z | t/z | s/z
    262 	fistpl	s				// 1/z | t | t/z | s/z
    263 	fistpl	t				// 1/z | t/z | s/z
    264 
    265 	fadds	zi16stepu
    266 	fxch	%st(2)
    267 	fadds	sdivz16stepu
    268 	fxch	%st(2)
    269 	flds	tdivz16stepu
    270 	faddp	%st(0),%st(2)
    271 	flds	fp_64k
    272 	fdiv	%st(1),%st(0)	// z = 1/1/z
    273 							// this is what we've gone to all this trouble to
    274 							//  overlap
    275 LFDIVInFlight1:
    276 
    277 	addl	s,%esi
    278 	addl	t,%edx
    279 	movl	C(bbextents),%ebx
    280 	movl	C(bbextentt),%ebp
    281 	cmpl	%ebx,%esi
    282 	ja		LClampHighOrLow0
    283 LClampReentry0:
    284 	movl	%esi,s
    285 	movl	pbase,%ebx
    286 	shll	$16,%esi
    287 	cmpl	%ebp,%edx
    288 	movl	%esi,sfracf
    289 	ja		LClampHighOrLow1
    290 LClampReentry1:
    291 	movl	%edx,t
    292 	movl	s,%esi					// sfrac = scans->sfrac;
    293 	shll	$16,%edx
    294 	movl	t,%eax					// tfrac = scans->tfrac;
    295 	sarl	$16,%esi
    296 	movl	%edx,tfracf
    297 
    298 //
    299 // calculate the texture starting address
    300 //
    301 	sarl	$16,%eax
    302 	movl	C(cachewidth),%edx
    303 	imull	%edx,%eax				// (tfrac >> 16) * cachewidth
    304 	addl	%ebx,%esi
    305 	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +
    306 									//           ((tfrac >> 16) * cachewidth);
    307 //
    308 // determine whether last span or not
    309 //
    310 	cmpl	$16,%ecx
    311 	jna		LLastSegment
    312 
    313 //
    314 // not the last segment; do full 16-wide segment
    315 //
    316 LNotLastSegment:
    317 
    318 //
    319 // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
    320 // get there
    321 //
    322 
    323 // pick up after the FDIV that was left in flight previously
    324 
    325 	fld		%st(0)			// duplicate it
    326 	fmul	%st(4),%st(0)	// s = s/z * z
    327 	fxch	%st(1)
    328 	fmul	%st(3),%st(0)	// t = t/z * z
    329 	fxch	%st(1)
    330 	fistpl	snext
    331 	fistpl	tnext
    332 	movl	snext,%eax
    333 	movl	tnext,%edx
    334 
    335 	movb	(%esi),%bl	// get first source texel
    336 	subl	$16,%ecx		// count off this segments' pixels
    337 	movl	C(sadjust),%ebp
    338 	movl	%ecx,counttemp	// remember count of remaining pixels
    339 
    340 	movl	C(tadjust),%ecx
    341 	movb	%bl,(%edi)	// store first dest pixel
    342 
    343 	addl	%eax,%ebp
    344 	addl	%edx,%ecx
    345 
    346 	movl	C(bbextents),%eax
    347 	movl	C(bbextentt),%edx
    348 
    349 	cmpl	$4096,%ebp
    350 	jl		LClampLow2
    351 	cmpl	%eax,%ebp
    352 	ja		LClampHigh2
    353 LClampReentry2:
    354 
    355 	cmpl	$4096,%ecx
    356 	jl		LClampLow3
    357 	cmpl	%edx,%ecx
    358 	ja		LClampHigh3
    359 LClampReentry3:
    360 
    361 	movl	%ebp,snext
    362 	movl	%ecx,tnext
    363 
    364 	subl	s,%ebp
    365 	subl	t,%ecx
    366 
    367 //
    368 // set up advancetable
    369 //
    370 	movl	%ecx,%eax
    371 	movl	%ebp,%edx
    372 	sarl	$20,%eax			// tstep >>= 16;
    373 	jz		LZero
    374 	sarl	$20,%edx			// sstep >>= 16;
    375 	movl	C(cachewidth),%ebx
    376 	imull	%ebx,%eax
    377 	jmp		LSetUp1
    378 
    379 LZero:
    380 	sarl	$20,%edx			// sstep >>= 16;
    381 	movl	C(cachewidth),%ebx
    382 
    383 LSetUp1:
    384 
    385 	addl	%edx,%eax			// add in sstep
    386 								// (tstep >> 16) * cachewidth + (sstep >> 16);
    387 	movl	tfracf,%edx
    388 	movl	%eax,advancetable+4	// advance base in t
    389 	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +
    390 								//  (sstep >> 16);
    391 	shll	$12,%ebp			// left-justify sstep fractional part
    392 	movl	sfracf,%ebx
    393 	shll	$12,%ecx			// left-justify tstep fractional part
    394 	movl	%eax,advancetable	// advance extra in t
    395 
    396 	movl	%ecx,tstep
    397 	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac
    398 
    399 	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)
    400 	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac
    401 	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel
    402 
    403 	addl	tstep,%edx
    404 	sbbl	%ecx,%ecx
    405 	movb	(%esi),%al
    406 	addl	%ebp,%ebx
    407 	movb	%al,1(%edi)
    408 	adcl	advancetable+4(,%ecx,4),%esi
    409 
    410 	addl	tstep,%edx
    411 	sbbl	%ecx,%ecx
    412 	addl	%ebp,%ebx
    413 	movb	(%esi),%al
    414 	adcl	advancetable+4(,%ecx,4),%esi
    415 
    416 	addl	tstep,%edx
    417 	sbbl	%ecx,%ecx
    418 	movb	%al,2(%edi)
    419 	addl	%ebp,%ebx
    420 	movb	(%esi),%al
    421 	adcl	advancetable+4(,%ecx,4),%esi
    422 
    423 	addl	tstep,%edx
    424 	sbbl	%ecx,%ecx
    425 	movb	%al,3(%edi)
    426 	addl	%ebp,%ebx
    427 	movb	(%esi),%al
    428 	adcl	advancetable+4(,%ecx,4),%esi
    429 
    430 	addl	tstep,%edx
    431 	sbbl	%ecx,%ecx
    432 	movb	%al,4(%edi)
    433 	addl	%ebp,%ebx
    434 	movb	(%esi),%al
    435 	adcl	advancetable+4(,%ecx,4),%esi
    436 
    437 	addl	tstep,%edx
    438 	sbbl	%ecx,%ecx
    439 	movb	%al,5(%edi)
    440 	addl	%ebp,%ebx
    441 	movb	(%esi),%al
    442 	adcl	advancetable+4(,%ecx,4),%esi
    443 
    444 	addl	tstep,%edx
    445 	sbbl	%ecx,%ecx
    446 	movb	%al,6(%edi)
    447 	addl	%ebp,%ebx
    448 	movb	(%esi),%al
    449 	adcl	advancetable+4(,%ecx,4),%esi
    450 
    451 	addl	tstep,%edx
    452 	sbbl	%ecx,%ecx
    453 	movb	%al,7(%edi)
    454 	addl	%ebp,%ebx
    455 	movb	(%esi),%al
    456 	adcl	advancetable+4(,%ecx,4),%esi
    457 
    458 
    459 //
    460 // start FDIV for end of next segment in flight, so it can overlap
    461 //
    462 	movl	counttemp,%ecx
    463 	cmpl	$16,%ecx			// more than one segment after this?
    464 	ja		LSetupNotLast2	// yes
    465 
    466 	decl	%ecx
    467 	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV
    468 	movl	%ecx,spancountminus1
    469 	fildl	spancountminus1
    470 
    471 	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1
    472 	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1
    473 	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
    474 	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
    475 	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
    476 	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1
    477 	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
    478 	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
    479 	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
    480 	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
    481 	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
    482 	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
    483 	faddp	%st(0),%st(4)	// 64k
    484 
    485 	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
    486 							//  overlap
    487 	jmp		LFDIVInFlight2
    488 
    489 	.align	4
    490 LSetupNotLast2:
    491 	fadds	zi16stepu
    492 	fxch	%st(2)
    493 	fadds	sdivz16stepu
    494 	fxch	%st(2)
    495 	flds	tdivz16stepu
    496 	faddp	%st(0),%st(2)
    497 	flds	fp_64k
    498 	fdiv	%st(1),%st(0)	// z = 1/1/z
    499 							// this is what we've gone to all this trouble to
    500 							//  overlap
    501 LFDIVInFlight2:
    502 	movl	%ecx,counttemp
    503 
    504 	addl	tstep,%edx
    505 	sbbl	%ecx,%ecx
    506 	movb	%al,8(%edi)
    507 	addl	%ebp,%ebx
    508 	movb	(%esi),%al
    509 	adcl	advancetable+4(,%ecx,4),%esi
    510 
    511 	addl	tstep,%edx
    512 	sbbl	%ecx,%ecx
    513 	movb	%al,9(%edi)
    514 	addl	%ebp,%ebx
    515 	movb	(%esi),%al
    516 	adcl	advancetable+4(,%ecx,4),%esi
    517 
    518 	addl	tstep,%edx
    519 	sbbl	%ecx,%ecx
    520 	movb	%al,10(%edi)
    521 	addl	%ebp,%ebx
    522 	movb	(%esi),%al
    523 	adcl	advancetable+4(,%ecx,4),%esi
    524 
    525 	addl	tstep,%edx
    526 	sbbl	%ecx,%ecx
    527 	movb	%al,11(%edi)
    528 	addl	%ebp,%ebx
    529 	movb	(%esi),%al
    530 	adcl	advancetable+4(,%ecx,4),%esi
    531 
    532 	addl	tstep,%edx
    533 	sbbl	%ecx,%ecx
    534 	movb	%al,12(%edi)
    535 	addl	%ebp,%ebx
    536 	movb	(%esi),%al
    537 	adcl	advancetable+4(,%ecx,4),%esi
    538 
    539 	addl	tstep,%edx
    540 	sbbl	%ecx,%ecx
    541 	movb	%al,13(%edi)
    542 	addl	%ebp,%ebx
    543 	movb	(%esi),%al
    544 	adcl	advancetable+4(,%ecx,4),%esi
    545 
    546 	addl	tstep,%edx
    547 	sbbl	%ecx,%ecx
    548 	movb	%al,14(%edi)
    549 	addl	%ebp,%ebx
    550 	movb	(%esi),%al
    551 	adcl	advancetable+4(,%ecx,4),%esi
    552 
    553 	addl	$16,%edi
    554 	movl	%edx,tfracf
    555 	movl	snext,%edx
    556 	movl	%ebx,sfracf
    557 	movl	tnext,%ebx
    558 	movl	%edx,s
    559 	movl	%ebx,t
    560 
    561 	movl	counttemp,%ecx		// retrieve count
    562 
    563 //
    564 // determine whether last span or not
    565 //
    566 	cmpl	$16,%ecx				// are there multiple segments remaining?
    567 	movb	%al,-1(%edi)
    568 	ja		LNotLastSegment		// yes
    569 
    570 //
    571 // last segment of scan
    572 //
    573 LLastSegment:
    574 
    575 //
    576 // advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
    577 // get there. The number of pixels left is variable, and we want to land on the
    578 // last pixel, not step one past it, so we can't run into arithmetic problems
    579 //
    580 	testl	%ecx,%ecx
    581 	jz		LNoSteps		// just draw the last pixel and we're done
    582 
    583 // pick up after the FDIV that was left in flight previously
    584 
    585 
    586 	fld		%st(0)			// duplicate it
    587 	fmul	%st(4),%st(0)	// s = s/z * z
    588 	fxch	%st(1)
    589 	fmul	%st(3),%st(0)	// t = t/z * z
    590 	fxch	%st(1)
    591 	fistpl	snext
    592 	fistpl	tnext
    593 
    594 	movb	(%esi),%al		// load first texel in segment
    595 	movl	C(tadjust),%ebx
    596 	movb	%al,(%edi)		// store first pixel in segment
    597 	movl	C(sadjust),%eax
    598 
    599 	addl	snext,%eax
    600 	addl	tnext,%ebx
    601 
    602 	movl	C(bbextents),%ebp
    603 	movl	C(bbextentt),%edx
    604 
    605 	cmpl	$4096,%eax
    606 	jl		LClampLow4
    607 	cmpl	%ebp,%eax
    608 	ja		LClampHigh4
    609 LClampReentry4:
    610 	movl	%eax,snext
    611 
    612 	cmpl	$4096,%ebx
    613 	jl		LClampLow5
    614 	cmpl	%edx,%ebx
    615 	ja		LClampHigh5
    616 LClampReentry5:
    617 
    618 	cmpl	$1,%ecx			// don't bother
    619 	je		LOnlyOneStep	// if two pixels in segment, there's only one step,
    620 							//  of the segment length
    621 	subl	s,%eax
    622 	subl	t,%ebx
    623 
    624 	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
    625 	addl	%ebx,%ebx		//  reciprocal yields 16.48
    626 
    627 	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /
    628 											//  (spancount-1)
    629 	movl	%edx,%ebp
    630 
    631 	movl	%ebx,%eax
    632 	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /
    633 											//  (spancount-1)
    634 LSetEntryvec:
    635 //
    636 // set up advancetable
    637 //
    638 	movl	entryvec_table_16(,%ecx,4),%ebx
    639 	movl	%edx,%eax
    640 	movl	%ebx,jumptemp		// entry point into code for RET later
    641 	movl	%ebp,%ecx
    642 	sarl	$16,%edx			// tstep >>= 16;
    643 	movl	C(cachewidth),%ebx
    644 	sarl	$16,%ecx			// sstep >>= 16;
    645 	imull	%ebx,%edx
    646 
    647 	addl	%ecx,%edx			// add in sstep
    648 								// (tstep >> 16) * cachewidth + (sstep >> 16);
    649 	movl	tfracf,%ecx
    650 	movl	%edx,advancetable+4	// advance base in t
    651 	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +
    652 								//  (sstep >> 16);
    653 	shll	$16,%ebp			// left-justify sstep fractional part
    654 	movl	sfracf,%ebx
    655 	shll	$16,%eax			// left-justify tstep fractional part
    656 	movl	%edx,advancetable	// advance extra in t
    657 
    658 	movl	%eax,tstep
    659 	movl	%ecx,%edx
    660 	addl	%eax,%edx
    661 	sbbl	%ecx,%ecx
    662 	addl	%ebp,%ebx
    663 	adcl	advancetable+4(,%ecx,4),%esi
    664 
    665 	jmp		*jumptemp			// jump to the number-of-pixels handler
    666 
    667 //----------------------------------------
    668 
    669 LNoSteps:
    670 	movb	(%esi),%al		// load first texel in segment
    671 	subl	$15,%edi			// adjust for hardwired offset
    672 	jmp		LEndSpan
    673 
    674 
    675 LOnlyOneStep:
    676 	subl	s,%eax
    677 	subl	t,%ebx
    678 	movl	%eax,%ebp
    679 	movl	%ebx,%edx
    680 	jmp		LSetEntryvec
    681 
    682 //----------------------------------------
    683 
    684 .globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16
    685 .globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16
    686 .globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16
    687 .globl	Entry14_16, Entry15_16, Entry16_16
    688 
    689 Entry2_16:
    690 	subl	$14,%edi		// adjust for hardwired offsets
    691 	movb	(%esi),%al
    692 	jmp		LEntry2_16
    693 
    694 //----------------------------------------
    695 
    696 Entry3_16:
    697 	subl	$13,%edi		// adjust for hardwired offsets
    698 	addl	%eax,%edx
    699 	movb	(%esi),%al
    700 	sbbl	%ecx,%ecx
    701 	addl	%ebp,%ebx
    702 	adcl	advancetable+4(,%ecx,4),%esi
    703 	jmp		LEntry3_16
    704 
    705 //----------------------------------------
    706 
    707 Entry4_16:
    708 	subl	$12,%edi		// adjust for hardwired offsets
    709 	addl	%eax,%edx
    710 	movb	(%esi),%al
    711 	sbbl	%ecx,%ecx
    712 	addl	%ebp,%ebx
    713 	adcl	advancetable+4(,%ecx,4),%esi
    714 	addl	tstep,%edx
    715 	jmp		LEntry4_16
    716 
    717 //----------------------------------------
    718 
    719 Entry5_16:
    720 	subl	$11,%edi		// adjust for hardwired offsets
    721 	addl	%eax,%edx
    722 	movb	(%esi),%al
    723 	sbbl	%ecx,%ecx
    724 	addl	%ebp,%ebx
    725 	adcl	advancetable+4(,%ecx,4),%esi
    726 	addl	tstep,%edx
    727 	jmp		LEntry5_16
    728 
    729 //----------------------------------------
    730 
    731 Entry6_16:
    732 	subl	$10,%edi		// adjust for hardwired offsets
    733 	addl	%eax,%edx
    734 	movb	(%esi),%al
    735 	sbbl	%ecx,%ecx
    736 	addl	%ebp,%ebx
    737 	adcl	advancetable+4(,%ecx,4),%esi
    738 	addl	tstep,%edx
    739 	jmp		LEntry6_16
    740 
    741 //----------------------------------------
    742 
    743 Entry7_16:
    744 	subl	$9,%edi		// adjust for hardwired offsets
    745 	addl	%eax,%edx
    746 	movb	(%esi),%al
    747 	sbbl	%ecx,%ecx
    748 	addl	%ebp,%ebx
    749 	adcl	advancetable+4(,%ecx,4),%esi
    750 	addl	tstep,%edx
    751 	jmp		LEntry7_16
    752 
    753 //----------------------------------------
    754 
    755 Entry8_16:
    756 	subl	$8,%edi		// adjust for hardwired offsets
    757 	addl	%eax,%edx
    758 	movb	(%esi),%al
    759 	sbbl	%ecx,%ecx
    760 	addl	%ebp,%ebx
    761 	adcl	advancetable+4(,%ecx,4),%esi
    762 	addl	tstep,%edx
    763 	jmp		LEntry8_16
    764 
    765 //----------------------------------------
    766 
    767 Entry9_16:
    768 	subl	$7,%edi		// adjust for hardwired offsets
    769 	addl	%eax,%edx
    770 	movb	(%esi),%al
    771 	sbbl	%ecx,%ecx
    772 	addl	%ebp,%ebx
    773 	adcl	advancetable+4(,%ecx,4),%esi
    774 	addl	tstep,%edx
    775 	jmp		LEntry9_16
    776 
    777 //----------------------------------------
    778 
    779 Entry10_16:
    780 	subl	$6,%edi		// adjust for hardwired offsets
    781 	addl	%eax,%edx
    782 	movb	(%esi),%al
    783 	sbbl	%ecx,%ecx
    784 	addl	%ebp,%ebx
    785 	adcl	advancetable+4(,%ecx,4),%esi
    786 	addl	tstep,%edx
    787 	jmp		LEntry10_16
    788 
    789 //----------------------------------------
    790 
    791 Entry11_16:
    792 	subl	$5,%edi		// adjust for hardwired offsets
    793 	addl	%eax,%edx
    794 	movb	(%esi),%al
    795 	sbbl	%ecx,%ecx
    796 	addl	%ebp,%ebx
    797 	adcl	advancetable+4(,%ecx,4),%esi
    798 	addl	tstep,%edx
    799 	jmp		LEntry11_16
    800 
    801 //----------------------------------------
    802 
    803 Entry12_16:
    804 	subl	$4,%edi		// adjust for hardwired offsets
    805 	addl	%eax,%edx
    806 	movb	(%esi),%al
    807 	sbbl	%ecx,%ecx
    808 	addl	%ebp,%ebx
    809 	adcl	advancetable+4(,%ecx,4),%esi
    810 	addl	tstep,%edx
    811 	jmp		LEntry12_16
    812 
    813 //----------------------------------------
    814 
    815 Entry13_16:
    816 	subl	$3,%edi		// adjust for hardwired offsets
    817 	addl	%eax,%edx
    818 	movb	(%esi),%al
    819 	sbbl	%ecx,%ecx
    820 	addl	%ebp,%ebx
    821 	adcl	advancetable+4(,%ecx,4),%esi
    822 	addl	tstep,%edx
    823 	jmp		LEntry13_16
    824 
    825 //----------------------------------------
    826 
    827 Entry14_16:
    828 	subl	$2,%edi		// adjust for hardwired offsets
    829 	addl	%eax,%edx
    830 	movb	(%esi),%al
    831 	sbbl	%ecx,%ecx
    832 	addl	%ebp,%ebx
    833 	adcl	advancetable+4(,%ecx,4),%esi
    834 	addl	tstep,%edx
    835 	jmp		LEntry14_16
    836 
    837 //----------------------------------------
    838 
    839 Entry15_16:
    840 	decl	%edi		// adjust for hardwired offsets
    841 	addl	%eax,%edx
    842 	movb	(%esi),%al
    843 	sbbl	%ecx,%ecx
    844 	addl	%ebp,%ebx
    845 	adcl	advancetable+4(,%ecx,4),%esi
    846 	addl	tstep,%edx
    847 	jmp		LEntry15_16
    848 
    849 //----------------------------------------
    850 
    851 Entry16_16:
    852 	addl	%eax,%edx
    853 	movb	(%esi),%al
    854 	sbbl	%ecx,%ecx
    855 	addl	%ebp,%ebx
    856 	adcl	advancetable+4(,%ecx,4),%esi
    857 
    858 	addl	tstep,%edx
    859 	sbbl	%ecx,%ecx
    860 	movb	%al,1(%edi)
    861 	addl	%ebp,%ebx
    862 	movb	(%esi),%al
    863 	adcl	advancetable+4(,%ecx,4),%esi
    864 	addl	tstep,%edx
    865 LEntry15_16:
    866 	sbbl	%ecx,%ecx
    867 	movb	%al,2(%edi)
    868 	addl	%ebp,%ebx
    869 	movb	(%esi),%al
    870 	adcl	advancetable+4(,%ecx,4),%esi
    871 	addl	tstep,%edx
    872 LEntry14_16:
    873 	sbbl	%ecx,%ecx
    874 	movb	%al,3(%edi)
    875 	addl	%ebp,%ebx
    876 	movb	(%esi),%al
    877 	adcl	advancetable+4(,%ecx,4),%esi
    878 	addl	tstep,%edx
    879 LEntry13_16:
    880 	sbbl	%ecx,%ecx
    881 	movb	%al,4(%edi)
    882 	addl	%ebp,%ebx
    883 	movb	(%esi),%al
    884 	adcl	advancetable+4(,%ecx,4),%esi
    885 	addl	tstep,%edx
    886 LEntry12_16:
    887 	sbbl	%ecx,%ecx
    888 	movb	%al,5(%edi)
    889 	addl	%ebp,%ebx
    890 	movb	(%esi),%al
    891 	adcl	advancetable+4(,%ecx,4),%esi
    892 	addl	tstep,%edx
    893 LEntry11_16:
    894 	sbbl	%ecx,%ecx
    895 	movb	%al,6(%edi)
    896 	addl	%ebp,%ebx
    897 	movb	(%esi),%al
    898 	adcl	advancetable+4(,%ecx,4),%esi
    899 	addl	tstep,%edx
    900 LEntry10_16:
    901 	sbbl	%ecx,%ecx
    902 	movb	%al,7(%edi)
    903 	addl	%ebp,%ebx
    904 	movb	(%esi),%al
    905 	adcl	advancetable+4(,%ecx,4),%esi
    906 	addl	tstep,%edx
    907 LEntry9_16:
    908 	sbbl	%ecx,%ecx
    909 	movb	%al,8(%edi)
    910 	addl	%ebp,%ebx
    911 	movb	(%esi),%al
    912 	adcl	advancetable+4(,%ecx,4),%esi
    913 	addl	tstep,%edx
    914 LEntry8_16:
    915 	sbbl	%ecx,%ecx
    916 	movb	%al,9(%edi)
    917 	addl	%ebp,%ebx
    918 	movb	(%esi),%al
    919 	adcl	advancetable+4(,%ecx,4),%esi
    920 	addl	tstep,%edx
    921 LEntry7_16:
    922 	sbbl	%ecx,%ecx
    923 	movb	%al,10(%edi)
    924 	addl	%ebp,%ebx
    925 	movb	(%esi),%al
    926 	adcl	advancetable+4(,%ecx,4),%esi
    927 	addl	tstep,%edx
    928 LEntry6_16:
    929 	sbbl	%ecx,%ecx
    930 	movb	%al,11(%edi)
    931 	addl	%ebp,%ebx
    932 	movb	(%esi),%al
    933 	adcl	advancetable+4(,%ecx,4),%esi
    934 	addl	tstep,%edx
    935 LEntry5_16:
    936 	sbbl	%ecx,%ecx
    937 	movb	%al,12(%edi)
    938 	addl	%ebp,%ebx
    939 	movb	(%esi),%al
    940 	adcl	advancetable+4(,%ecx,4),%esi
    941 	addl	tstep,%edx
    942 LEntry4_16:
    943 	sbbl	%ecx,%ecx
    944 	movb	%al,13(%edi)
    945 	addl	%ebp,%ebx
    946 	movb	(%esi),%al
    947 	adcl	advancetable+4(,%ecx,4),%esi
    948 LEntry3_16:
    949 	movb	%al,14(%edi)
    950 	movb	(%esi),%al
    951 LEntry2_16:
    952 
    953 LEndSpan:
    954 
    955 //
    956 // clear s/z, t/z, 1/z from FP stack
    957 //
    958 	fstp %st(0)
    959 	fstp %st(0)
    960 	fstp %st(0)
    961 
    962 	movl	pspantemp,%ebx				// restore spans pointer
    963 	movl	espan_t_pnext(%ebx),%ebx	// point to next span
    964 	testl	%ebx,%ebx			// any more spans?
    965 	movb	%al,15(%edi)
    966 	jnz		LSpanLoop			// more spans
    967 
    968 	popl	%ebx				// restore register variables
    969 	popl	%esi
    970 	popl	%edi
    971 	popl	%ebp				// restore the caller's stack frame
    972 	ret
    973 
    974 #endif	// id386
    975