Home | History | Annotate | Download | only in asm
      1 /* Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
      2 
      3 Permission is hereby granted, free of charge, to any person obtaining
      4 a copy of this software and associated documentation files (the
      5 "Software"), to deal in the Software without restriction, including
      6 without limitation the rights to use, copy, modify, merge, publish,
      7 distribute, sublicense, and/or sell copies of the Software, and to
      8 permit persons to whom the Software is furnished to do so, subject to
      9 the following conditions:
     10 
     11 The above copyright notice and this permission notice shall be
     12 included in all copies or substantial portions of the Software.
     13 
     14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     15 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     16 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     17 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
     18 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     19 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
     20 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
     21 
     22 //	Common registers are assigned as follows:
     23 //
     24 //	COMMON
     25 //
     26 //	t0		Const Tbl Ptr	TPtr
     27 //	t1		Round Constant	TRound
     28 //	t4		Block residual	LenResid
     29 //	t5		Residual Data	DTmp
     30 //
     31 //	{in,out}0	Block 0 Cycle	RotateM0
     32 //	{in,out}1	Block Value 12	M12
     33 //	{in,out}2	Block Value 8	M8
     34 //	{in,out}3	Block Value 4	M4
     35 //	{in,out}4	Block Value 0	M0
     36 //	{in,out}5	Block 1 Cycle	RotateM1
     37 //	{in,out}6	Block Value 13	M13
     38 //	{in,out}7	Block Value 9	M9
     39 //	{in,out}8	Block Value 5	M5
     40 //	{in,out}9	Block Value 1	M1
     41 //	{in,out}10	Block 2 Cycle	RotateM2
     42 //	{in,out}11	Block Value 14	M14
     43 //	{in,out}12	Block Value 10	M10
     44 //	{in,out}13	Block Value 6	M6
     45 //	{in,out}14	Block Value 2	M2
     46 //	{in,out}15	Block 3 Cycle	RotateM3
     47 //	{in,out}16	Block Value 15	M15
     48 //	{in,out}17	Block Value 11	M11
     49 //	{in,out}18	Block Value 7	M7
     50 //	{in,out}19	Block Value 3	M3
     51 //	{in,out}20	Scratch			Z
     52 //	{in,out}21	Scratch			Y
     53 //	{in,out}22	Scratch			X
     54 //	{in,out}23	Scratch			W
     55 //	{in,out}24	Digest A		A
     56 //	{in,out}25	Digest B		B
     57 //	{in,out}26	Digest C		C
     58 //	{in,out}27	Digest D		D
     59 //	{in,out}28	Active Data Ptr	DPtr
     60 //	in28		Dummy Value		-
     61 //	out28		Dummy Value		-
     62 //	bt0			Coroutine Link	QUICK_RTN
     63 //
     64 ///	These predicates are used for computing the padding block(s) and
     65 ///	are shared between the driver and digest co-routines
     66 //
     67 //	pt0			Extra Pad Block	pExtra
     68 //	pt1			Load next word	pLoad
     69 //	pt2			Skip next word	pSkip
     70 //	pt3			Search for Pad	pNoPad
     71 //	pt4			Pad Word 0		pPad0
     72 //	pt5			Pad Word 1		pPad1
     73 //	pt6			Pad Word 2		pPad2
     74 //	pt7			Pad Word 3		pPad3
     75 
        // Static (non-stacked) registers, the coroutine link branch register
        // and the predicates listed in the COMMON table above.  These names
        // are used by both the driver and the digest routines.
     76 #define	DTmp		r19
     77 #define	LenResid	r18
     78 #define	QUICK_RTN	b6
     79 #define	TPtr		r14
     80 #define	TRound		r15
     81 #define	pExtra		p6
     82 #define	pLoad		p7
     83 #define	pNoPad		p9
     84 #define	pPad0		p10
     85 #define	pPad1		p11
     86 #define	pPad2		p12
     87 #define	pPad3		p13
     88 #define	pSkip		p8
     89 
        // Caller-side names (trailing underscore) for the stacked registers.
        // The driver writes these outN registers before calling a digest
        // routine; the digest routine sees the same slot as inN under the
        // name without the underscore.
     90 #define	A_		out24
     91 #define	B_		out25
     92 #define	C_		out26
     93 #define	D_		out27
     94 #define	DPtr_		out28
     95 #define	M0_		out4
     96 #define	M1_		out9
     97 #define	M10_		out12
     98 #define	M11_		out17
     99 #define	M12_		out1
    100 #define	M13_		out6
    101 #define	M14_		out11
    102 #define	M15_		out16
    103 #define	M2_		out14
    104 #define	M3_		out19
    105 #define	M4_		out3
    106 #define	M5_		out8
    107 #define	M6_		out13
    108 #define	M7_		out18
    109 #define	M8_		out2
    110 #define	M9_		out7
    111 #define	RotateM0_	out0
    112 #define	RotateM1_	out5
    113 #define	RotateM2_	out10
    114 #define	RotateM3_	out15
    115 #define	W_		out23
    116 #define	X_		out22
    117 #define	Y_		out21
    118 #define	Z_		out20
    119 
        // Callee-side names for the stacked registers as seen by the digest
        // routines (inN).  Each name maps to the same slot number as its
        // underscore-suffixed outN counterpart used by the driver.
    120 #define	A		in24
    121 #define	B		in25
    122 #define	C		in26
    123 #define	D		in27
    124 #define	DPtr		in28
    125 #define	M0		in4
    126 #define	M1		in9
    127 #define	M10		in12
    128 #define	M11		in17
    129 #define	M12		in1
    130 #define	M13		in6
    131 #define	M14		in11
    132 #define	M15		in16
    133 #define	M2		in14
    134 #define	M3		in19
    135 #define	M4		in3
    136 #define	M5		in8
    137 #define	M6		in13
    138 #define	M7		in18
    139 #define	M8		in2
    140 #define	M9		in7
    141 #define	RotateM0	in0
    142 #define	RotateM1	in5
    143 #define	RotateM2	in10
    144 #define	RotateM3	in15
    145 #define	W		in23
    146 #define	X		in22
    147 #define	Y		in21
    148 #define	Z		in20
    149 
    150 /* register stack configuration for md5_block_asm_data_order(): */
        /* Three inputs (context pointer, data pointer, block count), no
           locals, and 29 outputs (out0..out28) that carry the message words,
           scratch registers, digest state and data pointer to the helpers.  */
    151 #define	MD5_NINP	3
    152 #define	MD5_NLOC	0
    153 #define MD5_NOUT	29
    154 #define MD5_NROT	0
    155 
    156 /* register stack configuration for helpers: */
        /* The helpers take all of the driver's outputs as their inputs.  The
           first 24 of them (in0..in23: the message words and the Z/Y/X/W
           scratch registers) are declared rotating; A..D and DPtr (in24..in28)
           lie outside the rotating region.  */
    157 #define	_NINPUTS	MD5_NOUT
    158 #define	_NLOCALS	0
    159 #define _NOUTPUT	0
    160 #define	_NROTATE	24	/* this must be <= _NINPUTS */
    161 
        // ADDP is the add used on incoming pointers in the driver prologue:
        // addp4 when building for HP-UX without _LP64, plain add otherwise.
    162 #if defined(_HPUX_SOURCE) && !defined(_LP64)
    163 #define	ADDP	addp4
    164 #else
    165 #define	ADDP	add
    166 #endif
    167 
        // HOST_IS_BIG_ENDIAN makes the driver clear psr.be (rum) before it
        // reads the input and set it again (sum) at .md5_exit.
    168 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
    169 #define HOST_IS_BIG_ENDIAN
    170 #endif
    171 
    172 //	Macros for getting the left and right portions of little-endian words
        //
        //	align is the byte misalignment of the input (1..3 where used).
        //
        //	GETLW: dst = the low 8*align bits of src, deposited at bit
        //	       position 32 - 8*align with every other bit zero (dep.z).
        //	GETRW: dst = the 32 - 8*align bits of src starting at bit
        //	       8*align, zero-extended (extr.u).
        //
        //	OR-ing GETRW of one aligned word with GETLW of the next aligned
        //	word yields the 32-bit little-endian word that straddles them.
    173 
    174 #define	GETLW(dst, src, align)	dep.z dst = src, 32 - 8 * align, 8 * align
    175 #define	GETRW(dst, src, align)	extr.u dst = src, 8 * align, 32 - 8 * align
    176 
    177 //	MD5 driver
    178 //
    179 //		Reads an input block, then calls the digest block
    180 //		subroutine and adds the results to the accumulated
    181 //		digest.  It allocates 29 outs (MD5_NOUT) which the subroutine
    182 //		uses as its inputs and rotating
    183 //		registers. Initializes the round constant pointer and
    184 //		takes care of saving/restoring ar.lc
    185 //
    186 ///	INPUT
    187 //
    188 //	in0		Context Ptr		CtxPtr0
    189 //	in1		Input Data Ptr		DPtrIn
    190 //	in2		Integral Blocks		BlockCount
    191 //	rp		Return Address		-
    192 //
    193 ///	CODE
    194 //
    195 //	v2		Input Align		InAlign
    196 //	t0		Shared w/digest		-
    197 //	t1		Shared w/digest		-
    198 //	t2		Shared w/digest		-
    199 //	t3		Shared w/digest		-
    200 //	t4		Shared w/digest		-
    201 //	t5		Shared w/digest		-
    202 //	t6		PFS Save		PFSSave
    203 //	t7		ar.lc Save		LCSave
    204 //	t8		Saved PR		PRSave
    205 //	t9		2nd CtxPtr		CtxPtr1
    206 //	t10		Table Base		CTable
    207 //	t11		Table[0]		CTable0
    208 //	t13		Accumulator A		AccumA
    209 //	t14		Accumulator B		AccumB
    210 //	t15		Accumulator C		AccumC
    211 //	t16		Accumulator D		AccumD
    212 //	pt0		Shared w/digest		-
    213 //	pt1		Shared w/digest		-
    214 //	pt2		Shared w/digest		-
    215 //	pt3		Shared w/digest		-
    216 //	pt4		Shared w/digest		-
    217 //	pt5		Shared w/digest		-
    218 //	pt6		Shared w/digest		-
    219 //	pt7		Shared w/digest		-
    220 //	pt8		Not Aligned		pOff
    221 //	pt8		Blocks Left		pAgain
    222 
        // Register names private to the driver, matching the CODE table
        // above.  The three arguments stay in in0..in2.  pAgain and pOff
        // deliberately share p63: pOff is only used while dispatching on the
        // input alignment, pAgain only for the loop-back test after a block.
    223 #define	AccumA		r27
    224 #define	AccumB		r28
    225 #define	AccumC		r29
    226 #define	AccumD		r30
    227 #define	CTable		r24
    228 #define	CTable0		r25
    229 #define	CtxPtr0		in0
    230 #define	CtxPtr1		r23
    231 #define	DPtrIn		in1
    232 #define	BlockCount	in2
    233 #define	InAlign		r10
    234 #define	LCSave		r21
    235 #define	PFSSave		r20
    236 #define	PRSave		r22
    237 #define	pAgain		p63
    238 #define	pOff		p63
    239 
    240 	.text
    241 
    242 /* md5_block_asm_data_order(MD5_CTX *c, const void *data, size_t num)
    243 
    244      where:
    245       c: a pointer to a structure of this type:
    246 
    247 	   typedef struct MD5state_st
    248 	     {
    249 	       MD5_LONG A,B,C,D;
    250 	       MD5_LONG Nl,Nh;
    251 	       MD5_LONG data[MD5_LBLOCK];
    252 	       unsigned int num;
    253 	     }
    254 	   MD5_CTX;
    255 
    256       data: a pointer to the input data (may be misaligned)
    257       num:  the number of 64-byte blocks to hash (i.e., the length
    258             of DATA is 64*NUM).
    259 
    260    */
    261 
        // md5_block_asm_data_order: entry point and aligned-input driver.
        //
        // Loads the digest state A, B, C, D from the context into
        // AccumA..AccumD.  For every block it loads the first four message
        // words into M12_..M15_, copies the accumulators into A_..D_, calls
        // md5_digest_block0 through QUICK_RTN and adds the returned A_..D_
        // back into the accumulators.  Input that is not 4-byte aligned is
        // handed to .md5_unaligned instead.  At .md5_exit the state is written
        // back and pr, ar.lc and ar.pfs are restored before returning via rp.
        //
        // NOTE(review): BlockCount is decremented before it is compared with
        // zero and is not checked on entry, so a count of 0 would not stop
        // the loop after zero blocks -- presumably callers always pass a
        // count of at least 1; confirm against the C caller.
    262 	.type	md5_block_asm_data_order, @function
    263 	.global	md5_block_asm_data_order
    264 	.align	32
    265 	.proc	md5_block_asm_data_order
    266 md5_block_asm_data_order:
    267 .md5_block:
    268 	.prologue
        // Allocate the frame, form a second context pointer 8 bytes in (used
        // for C and D) and capture ip as the base for locating the table.
    269 {	.mmi
    270 	.save	ar.pfs, PFSSave
    271 	alloc	PFSSave = ar.pfs, MD5_NINP, MD5_NLOC, MD5_NOUT, MD5_NROT
    272 	ADDP	CtxPtr1 = 8, CtxPtr0
    273 	mov	CTable = ip
    274 }
        // Pass both incoming pointers through ADDP and save ar.lc, which the
        // digest routines overwrite.
    275 {	.mmi
    276 	ADDP	DPtrIn = 0, DPtrIn
    277 	ADDP	CtxPtr0 = 0, CtxPtr0
    278 	.save	ar.lc, LCSave
    279 	mov	LCSave = ar.lc
    280 }
    281 ;;
        // CTable = ip + (table - .md5_block), i.e. the address of the constant
        // table; InAlign = low two bits of the data pointer.
    282 {	.mmi
    283 	add	CTable = .md5_tbl_data_order#-.md5_block#, CTable
    284 	and	InAlign = 0x3, DPtrIn
    285 }
    286 
        // Load A and C, post-incrementing the pointers to B and D, and save
        // the predicate registers.
    287 {	.mmi
    288 	ld4	AccumA = [CtxPtr0], 4
    289 	ld4	AccumC = [CtxPtr1], 4
    290 	.save pr, PRSave
    291 	mov	PRSave = pr
    292 	.body
    293 }
    294 ;;
        // Load B and D (pointers stay on B and D); DPtr_ is the data pointer
        // with its low two bits cleared.
    295 {	.mmi
    296 	ld4	AccumB = [CtxPtr0]
    297 	ld4	AccumD = [CtxPtr1]
    298 	dep	DPtr_ = 0, DPtrIn, 0, 2
    299 } ;;
    300 #ifdef HOST_IS_BIG_ENDIAN
    301 	rum	psr.be;;	// switch to little-endian
    302 #endif
        // CTable0 = first table entry and CTable now points at the second;
        // both are copied into TRound/TPtr at the top of every block.
        // Any non-zero alignment leaves for the unaligned handlers.
    303 {	.mmb
    304 	ld4	CTable0 = [CTable], 4
    305 	cmp.ne	pOff, p0 = 0, InAlign
    306 (pOff)	br.cond.spnt.many .md5_unaligned
    307 } ;;
    308 
    309 //	The FF load/compute loop rotates values three times, so that
    310 //	loading into M12 here produces the M0 value, M13 -> M1, etc.
    311 
    312 .md5_block_loop0:
    313 {	.mmi
    314 	ld4	M12_ = [DPtr_], 4
    315 	mov	TPtr = CTable
    316 	mov	TRound = CTable0
    317 } ;;
    318 {	.mmi
    319 	ld4	M13_ = [DPtr_], 4
    320 	mov	A_ = AccumA
    321 	mov	B_ = AccumB
    322 } ;;
    323 {	.mmi
    324 	ld4	M14_ = [DPtr_], 4
    325 	mov	C_ = AccumC
    326 	mov	D_ = AccumD
    327 } ;;
        // The remaining twelve words of the block are loaded by the digest
        // routine itself through DPtr.
    328 {	.mmb
    329 	ld4	M15_ = [DPtr_], 4
    330 	add	BlockCount = -1, BlockCount
    331 	br.call.sptk.many QUICK_RTN = md5_digest_block0
    332 } ;;
    333 
    334 //	Now, we add the new digest values and do some clean-up
    335 //	before checking if there's another full block to process
    336 
    337 {	.mmi
    338 	add	AccumA = AccumA, A_
    339 	add	AccumB = AccumB, B_
    340 	cmp.ne	pAgain, p0 = 0, BlockCount
    341 }
    342 {	.mib
    343 	add	AccumC = AccumC, C_
    344 	add	AccumD = AccumD, D_
    345 (pAgain) br.cond.dptk.many .md5_block_loop0
    346 } ;;
    347 
        // Common exit, also reached from the unaligned loops.
    348 .md5_exit:
    349 #ifdef HOST_IS_BIG_ENDIAN
    350 	sum	psr.be;;	// switch back to big-endian mode
    351 #endif
        // The context pointers were left on B and D, so store those first and
        // step back 4 bytes to reach A and C.
    352 {	.mmi
    353 	st4	[CtxPtr0] = AccumB, -4
    354 	st4	[CtxPtr1] = AccumD, -4
    355 	mov	pr = PRSave, 0x1ffff ;;
    356 }
    357 {	.mmi
    358 	st4	[CtxPtr0] = AccumA
    359 	st4	[CtxPtr1] = AccumC
    360 	mov	ar.lc = LCSave
    361 } ;;
    362 {	.mib
    363 	mov	ar.pfs = PFSSave
    364 	br.ret.sptk.few	rp
    365 } ;;
    366 
        // MD5UNALIGNED(offset): driver loop for input that starts offset
        // bytes (1, 2 or 3) past a 4-byte boundary.
        //
        // On entry at .md5_process<offset>, DTmp holds the aligned word that
        // contains the first data bytes; GETRW reduces it to those bytes.
        // Each pass of .md5_block_loop<offset> loads the next aligned word
        // into Y_, combines its low bytes (GETLW, via W_) with DTmp to form
        // M12_, and leaves the remaining bytes of Y_ in DTmp as the carry for
        // the following word.  M13_..M15_ are loaded as raw aligned words;
        // md5_digest_block<offset> merges them with DTmp as it goes.  After
        // the call the digest is accumulated exactly as in the aligned loop,
        // and when no blocks remain control branches to .md5_exit.
    367 #define	MD5UNALIGNED(offset)						\
    368 .md5_process##offset:							\
    369 {	.mib ;								\
    370 	nop	0x0	;						\
    371 	GETRW(DTmp, DTmp, offset) ;					\
    372 } ;;									\
    373 .md5_block_loop##offset:						\
    374 {	.mmi ;								\
    375 	ld4	Y_ = [DPtr_], 4 ;					\
    376 	mov	TPtr = CTable ;						\
    377 	mov	TRound = CTable0 ;					\
    378 } ;;									\
    379 {	.mmi ;								\
    380 	ld4	M13_ = [DPtr_], 4 ;					\
    381 	mov	A_ = AccumA ;						\
    382 	mov	B_ = AccumB ;						\
    383 } ;;									\
    384 {	.mii ;								\
    385 	ld4	M14_ = [DPtr_], 4 ;					\
    386 	GETLW(W_, Y_, offset) ;						\
    387 	mov	C_ = AccumC ;						\
    388 }									\
    389 {	.mmi ;								\
    390 	mov	D_ = AccumD ;;						\
    391 	or	M12_ = W_, DTmp ;					\
    392 	GETRW(DTmp, Y_, offset) ;					\
    393 }									\
    394 {	.mib ;								\
    395 	ld4	M15_ = [DPtr_], 4 ;					\
    396 	add	BlockCount = -1, BlockCount ;				\
    397 	br.call.sptk.many QUICK_RTN = md5_digest_block##offset;		\
    398 } ;;									\
    399 {	.mmi ;								\
    400 	add	AccumA = AccumA, A_ ;					\
    401 	add	AccumB = AccumB, B_ ;					\
    402 	cmp.ne	pAgain, p0 = 0, BlockCount ;				\
    403 }									\
    404 {	.mib ;								\
    405 	add	AccumC = AccumC, C_ ;					\
    406 	add	AccumD = AccumD, D_ ;					\
    407 (pAgain) br.cond.dptk.many .md5_block_loop##offset ;			\
    408 } ;;									\
    409 {	.mib ;								\
    410 	nop	0x0 ;							\
    411 	nop	0x0 ;							\
    412 	br.cond.sptk.many .md5_exit ;					\
    413 } ;;
    414 
    415 	.align	32
    416 .md5_unaligned:
    417 //
    418 //	Because variable shifts are expensive, we special case each of
    419 //	the four alignments. In practice, this won't hurt too much
    420 //	since only one working set of code will be loaded.
    421 //
        //	Reached from the driver when InAlign is non-zero.  DTmp receives
        //	the aligned word containing the first data bytes.  InAlign 1 and
        //	2 branch to their handlers; InAlign 3 falls through into the
        //	MD5UNALIGNED(3) expansion, which is why it is emitted first.
    422 {	.mib
    423 	ld4	DTmp = [DPtr_], 4
    424 	cmp.eq	pOff, p0 = 1, InAlign
    425 (pOff)	br.cond.dpnt.many .md5_process1
    426 } ;;
    427 {	.mib
    428 	cmp.eq	pOff, p0 = 2, InAlign
    429 	nop	0x0
    430 (pOff)	br.cond.dpnt.many .md5_process2
    431 } ;;
    432 	MD5UNALIGNED(3)
    433 	MD5UNALIGNED(1)
    434 	MD5UNALIGNED(2)
    435 
    436 	.endp md5_block_asm_data_order
    437 
    438 
    439 // MD5 Perform the F function and load
    440 //
    441 // Passed the first 4 words (M0 - M3) and initial (A, B, C, D) values,
    442 // computes the FF() round of functions, then branches to the common
    443 // digest code to finish up with GG(), HH, and II().
    444 //
    445 // INPUT
    446 //
    447 // rp Return Address -
    448 //
    449 // CODE
    450 //
    451 // v0 PFS bit bucket PFS
    452 // v1 Loop Trip Count LTrip
    453 // pt0 Load next word pMore
    454 
    455 /* For F round: */
        /* LTrip counts down the passes that still load data, PFS receives the
           alloc result and is not read again, and pMore (p6, the same
           predicate as pExtra) gates the loads of further message words.  */
    456 #define LTrip	r9
    457 #define PFS	r8
    458 #define pMore	p6
    459 
    460 /* For GHI rounds: */
        /* Extra temporaries for the message-word shuffles.  T reuses r9
           (LTrip), which is no longer needed once the F round is done.  U
           reuses r10 (InAlign), which the driver only reads before its first
           call into the digest code.  */
    461 #define T	r9
    462 #define U	r10
    463 #define V	r11
    464 
        // COMPUTE(a, b, s, M, R): second half of a G/H/I step.
        //
        // Expects Z to hold the sum built by the preceding G, H or I macro.
        // Fetches the next round constant into TRound, rotates the low 32
        // bits of Z left by s (dep.z copies them into the upper half of Y,
        // then shrp shifts the Z:Y pair right by 64 - s), sets a = Z + b and
        // copies message word M into R, its RotateMn slot.
    465 #define COMPUTE(a, b, s, M, R)			\
    466 {						\
    467 	.mii ;					\
    468 	ld4 TRound = [TPtr], 4 ;		\
    469 	dep.z Y = Z, 32, 32 ;;			\
    470 	shrp Z = Z, Y, 64 - s ;			\
    471 } ;;						\
    472 {						\
    473 	.mmi ;					\
    474 	add a = Z, b ;				\
    475 	mov R = M ;				\
    476 	nop 0x0 ;				\
    477 } ;;
    478 
        // LOOP(a, b, s, M, R, label): same work as COMPUTE, but the second
        // bundle ends with br.ctop back to label.  It is used for the fourth
        // step of each four-step group so that the loop counter and the
        // register rotation advance once per group.
    479 #define LOOP(a, b, s, M, R, label)		\
    480 {	.mii ;					\
    481 	ld4 TRound = [TPtr], 4 ;		\
    482 	dep.z Y = Z, 32, 32 ;;			\
    483 	shrp Z = Z, Y, 64 - s ;			\
    484 } ;;						\
    485 {	.mib ;					\
    486 	add a = Z, b ;				\
    487 	mov R = M ;				\
    488 	br.ctop.sptk.many label ;		\
    489 } ;;
    490 
    491 // G(B, C, D) = (B & D) | (C & ~D)
        //
        // G(a, b, c, d, M) leaves Z = a + M + TRound + G(b, c, d), using X and
        // Y as temporaries.  The rotate and the final add of b are done by the
        // COMPUTE or LOOP that follows it.
    492 
    493 #define G(a, b, c, d, M)			\
    494 {	.mmi ;					\
    495 	add Z = M, TRound ;			\
    496 	and Y = b, d ;				\
    497 	andcm X = c, d ;			\
    498 } ;;						\
    499 {	.mii ;					\
    500 	add Z = Z, a ;				\
    501 	or Y = Y, X ;;				\
    502 	add Z = Z, Y ;				\
    503 } ;;
    504 
    505 // H(B, C, D) = B ^ C ^ D
        //
        // H(a, b, c, d, M) leaves Z = a + M + TRound + H(b, c, d), using Y as
        // a temporary.  The rotate and the final add of b are done by the
        // COMPUTE or LOOP that follows it.
    506 
    507 #define H(a, b, c, d, M)			\
    508 {	.mmi ;					\
    509 	add Z = M, TRound ;			\
    510 	xor Y = b, c ;				\
    511 	nop 0x0 ;				\
    512 } ;;						\
    513 {	.mii ;					\
    514 	add Z = Z, a ;				\
    515 	xor Y = Y, d ;;				\
    516 	add Z = Z, Y ;				\
    517 } ;;
    518 
    519 // I(B, C, D) = C ^ (B | ~D)
    520 //
    521 // However, since we have an andcm operator, we use the fact that
    522 //
    523 // Y ^ Z == ~Y ^ ~Z
    524 //
    525 // to rewrite the expression as
    526 //
    527 // I(B, C, D) = ~C ^ (~B & D)
        //
        // I(a, b, c, d, M) leaves Z = a + M + TRound + I(b, c, d): Y is formed
        // as d & ~b, X as ~c (andcm with -1), and the two are XOR-ed.  The
        // rotate and the final add of b are done by the COMPUTE or LOOP that
        // follows it.
    528 
    529 #define I(a, b, c, d, M)			\
    530 {	.mmi ;					\
    531 	add Z = M, TRound ;			\
    532 	andcm Y = d, b ;			\
    533 	andcm X = -1, c ;			\
    534 } ;;						\
    535 {	.mii ;					\
    536 	add Z = Z, a ;				\
    537 	xor Y = Y, X ;;				\
    538 	add Z = Z, Y ;				\
    539 } ;;
    540 
        // GG4(label): four consecutive G steps on M0..M3 with rotate counts
        // 5, 9, 14 and 20.  The destination cycles A, D, C, B and the last
        // step branches back to label through LOOP.
    541 #define GG4(label)				\
    542 	G(A, B, C, D, M0)			\
    543 	COMPUTE(A, B, 5, M0, RotateM0)		\
    544 	G(D, A, B, C, M1)			\
    545 	COMPUTE(D, A, 9, M1, RotateM1)		\
    546 	G(C, D, A, B, M2)			\
    547 	COMPUTE(C, D, 14, M2, RotateM2)		\
    548 	G(B, C, D, A, M3)			\
    549 	LOOP(B, C, 20, M3, RotateM3, label)
    550 
        // HH4(label): four consecutive H steps on M0..M3 with rotate counts
        // 4, 11, 16 and 23.  The destination cycles A, D, C, B and the last
        // step branches back to label through LOOP.
    551 #define HH4(label)				\
    552 	H(A, B, C, D, M0)			\
    553 	COMPUTE(A, B, 4, M0, RotateM0)		\
    554 	H(D, A, B, C, M1)			\
    555 	COMPUTE(D, A, 11, M1, RotateM1)		\
    556 	H(C, D, A, B, M2)			\
    557 	COMPUTE(C, D, 16, M2, RotateM2)		\
    558 	H(B, C, D, A, M3)			\
    559 	LOOP(B, C, 23, M3, RotateM3, label)
    560 
        // II4(label): four consecutive I steps on M0..M3 with rotate counts
        // 6, 10, 15 and 21.  The destination cycles A, D, C, B and the last
        // step branches back to label through LOOP.
    561 #define II4(label)				\
    562 	I(A, B, C, D, M0)			\
    563 	COMPUTE(A, B, 6, M0, RotateM0)		\
    564 	I(D, A, B, C, M1)			\
    565 	COMPUTE(D, A, 10, M1, RotateM1)		\
    566 	I(C, D, A, B, M2)			\
    567 	COMPUTE(C, D, 15, M2, RotateM2)		\
    568 	I(B, C, D, A, M3)			\
    569 	LOOP(B, C, 21, M3, RotateM3, label)
    570 
        // FFLOAD(a, b, c, d, M, N, s): one complete F step for aligned input.
        //
        // While pMore is set, loads the next input word from [DPtr] into N
        // (a RotateMn slot).  Builds Z = a + M + TRound + ((b & c) | (d & ~b)),
        // fetches the next round constant into TRound, rotates the low 32
        // bits of Z left by s (dep.z followed by shrp) and sets a = Z + b.
    571 #define FFLOAD(a, b, c, d, M, N, s)		\
    572 {	.mii ;					\
    573 (pMore) ld4 N = [DPtr], 4 ;			\
    574 	add Z = M, TRound ;			\
    575 	and Y = c, b ;				\
    576 }						\
    577 {	.mmi ;					\
    578 	andcm X = d, b ;;			\
    579 	add Z = Z, a ;				\
    580 	or Y = Y, X ;				\
    581 } ;;						\
    582 {	.mii ;					\
    583 	ld4 TRound = [TPtr], 4 ;		\
    584 	add Z = Z, Y ;;				\
    585 	dep.z Y = Z, 32, 32 ;			\
    586 } ;;						\
    587 {	.mii ;					\
    588 	nop 0x0 ;				\
    589 	shrp Z = Z, Y, 64 - s ;;		\
    590 	add a = Z, b ;				\
    591 } ;;
    592 
        // FFLOOP(a, b, c, d, M, N, s, dest): same F step as FFLOAD, followed
        // by the loop control for the F round.  pMore is recomputed as
        // (LTrip != 0) from the value LTrip had before the decrement in the
        // same bundle, then br.ctop branches back to dest.
    593 #define FFLOOP(a, b, c, d, M, N, s, dest)	\
    594 {	.mii ;					\
    595 (pMore)	ld4 N = [DPtr], 4 ;			\
    596 	add Z = M, TRound ;			\
    597 	and Y = c, b ;				\
    598 }						\
    599 {	.mmi ;					\
    600 	andcm X = d, b ;;			\
    601 	add Z = Z, a ;				\
    602 	or Y = Y, X ;				\
    603 } ;;						\
    604 {	.mii ;					\
    605 	ld4 TRound = [TPtr], 4 ;		\
    606 	add Z = Z, Y ;;				\
    607 	dep.z Y = Z, 32, 32 ;			\
    608 } ;;						\
    609 {	.mii ;					\
    610 	nop 0x0 ;				\
    611 	shrp Z = Z, Y, 64 - s ;;		\
    612 	add a = Z, b ;				\
    613 }						\
    614 {	.mib ;					\
    615 	cmp.ne pMore, p0 = 0, LTrip ;		\
    616 	add LTrip = -1, LTrip ;			\
    617 	br.ctop.dptk.many dest ;		\
    618 } ;;
    619 
        // md5_digest_block0: F round for 4-byte aligned input.
        //
        // Called from the driver through QUICK_RTN with the first four
        // message words in M12..M15, the digest state in A..D, the first
        // round constant in TRound and TPtr on the next table entry.
        //
        // The loop body (four F steps) runs four times (ar.lc = 3).  pMore
        // starts true and LTrip starts at 2, so the first three passes each
        // load four more words through DPtr and the last pass loads none.
        // ar.ec = 0 means the final br.ctop falls through without a further
        // register rotation, which gives the three rotations the driver
        // relies on when it loads word 0 into M12.
    620 	.type md5_digest_block0, @function
    621 	.align 32
    622 
    623 	.proc md5_digest_block0
    624 	.prologue
    625 md5_digest_block0:
    626 	.altrp QUICK_RTN
    627 	.body
    628 {	.mmi
    629 	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
    630 	mov LTrip = 2
    631 	mov ar.lc = 3
    632 } ;;
    633 {	.mii
    634 	cmp.eq pMore, p0 = r0, r0
    635 	mov ar.ec = 0
    636 	nop 0x0
    637 } ;;
    638 
    639 .md5_FF_round0:
    640 	FFLOAD(A, B, C, D, M12, RotateM0, 7)
    641 	FFLOAD(D, A, B, C, M13, RotateM1, 12)
    642 	FFLOAD(C, D, A, B, M14, RotateM2, 17)
    643 	FFLOOP(B, C, D, A, M15, RotateM3, 22, .md5_FF_round0)
    644 	//
    645 	// !!! Fall through to md5_digest_GHI
    646 	//
    647 	.endp md5_digest_block0
    648 
        // md5_digest_GHI: the G, H and I rounds, shared by all F-round entry
        // points (md5_digest_block0 falls through into it, the unaligned
        // variants branch to it).
        //
        // Before each round the sixteen message words are permuted in place
        // so that the round can consume them in M0, M1, M2, M3 order; Z, Y,
        // X, W, V, U and T hold the words that would otherwise be overwritten
        // during the shuffle.  Each round then runs its four-step macro four
        // times (ar.lc = 3); ar.ec = 1 makes the final br.ctop rotate the
        // registers as well before falling through.  Returns to the driver
        // through QUICK_RTN with the updated state in A..D.
    649 	.type md5_digest_GHI, @function
    650 	.align 32
    651 
    652 	.proc md5_digest_GHI
    653 	.prologue
    654 	.regstk _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
    655 md5_digest_GHI:
    656 	.altrp QUICK_RTN
    657 	.body
    658 //
    659 // The following sequence shuffles the block constants round for the
    660 // next round:
    661 //
    662 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
    663 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
    664 //
    665 {	.mmi
    666 	mov Z = M0
    667 	mov Y = M15
    668 	mov ar.lc = 3
    669 }
    670 {	.mmi
    671 	mov X = M2
    672 	mov W = M9
    673 	mov V = M4
    674 } ;;
    675 
    676 {	.mmi
    677 	mov M0 = M1
    678 	mov M15 = M12
    679 	mov ar.ec = 1
    680 }
    681 {	.mmi
    682 	mov M2 = M11
    683 	mov M9 = M14
    684 	mov M4 = M5
    685 } ;;
    686 
    687 {	.mmi
    688 	mov M1 = M6
    689 	mov M12 = M13
    690 	mov U = M3
    691 }
    692 {	.mmi
    693 	mov M11 = M8
    694 	mov M14 = M7
    695 	mov M5 = M10
    696 } ;;
    697 
    698 {	.mmi
    699 	mov M6 = Y
    700 	mov M13 = X
    701 	mov M3 = Z
    702 }
    703 {	.mmi
    704 	mov M8 = W
    705 	mov M7 = V
    706 	mov M10 = U
    707 } ;;
    708 
    709 .md5_GG_round:
    710 	GG4(.md5_GG_round)
    711 
    712 // The following sequence shuffles the block constants round for the
    713 // next round:
    714 //
    715 // 1 6 11 0 5 10 15 4 9 14 3 8 13 2 7 12
    716 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
        //
        // M2 and M10 already hold the right words and are left untouched.
    717 
    718 {	.mmi
    719 	mov Z = M0
    720 	mov Y = M1
    721 	mov ar.lc = 3
    722 }
    723 {	.mmi
    724 	mov X = M3
    725 	mov W = M5
    726 	mov V = M6
    727 } ;;
    728 
    729 {	.mmi
    730 	mov M0 = M4
    731 	mov M1 = M11
    732 	mov ar.ec = 1
    733 }
    734 {	.mmi
    735 	mov M3 = M9
    736 	mov U = M8
    737 	mov T = M13
    738 } ;;
    739 
    740 {	.mmi
    741 	mov M4 = Z
    742 	mov M11 = Y
    743 	mov M5 = M7
    744 }
    745 {	.mmi
    746 	mov M6 = M14
    747 	mov M8 = M12
    748 	mov M13 = M15
    749 } ;;
    750 
    751 {	.mmi
    752 	mov M7 = W
    753 	mov M14 = V
    754 	nop 0x0
    755 }
    756 {	.mmi
    757 	mov M9 = X
    758 	mov M12 = U
    759 	mov M15 = T
    760 } ;;
    761 
    762 .md5_HH_round:
    763 	HH4(.md5_HH_round)
    764 
    765 // The following sequence shuffles the block constants round for the
    766 // next round:
    767 //
    768 // 5 8 11 14 1 4 7 10 13 0 3 6 9 12 15 2
    769 // 0 7 14 5 12 3 10 1 8 15 6 13 4 11 2 9
    770 
    771 {	.mmi
    772 	mov Z = M0
    773 	mov Y = M15
    774 	mov ar.lc = 3
    775 }
    776 {	.mmi
    777 	mov X = M10
    778 	mov W = M1
    779 	mov V = M4
    780 } ;;
    781 
    782 {	.mmi
    783 	mov M0 = M9
    784 	mov M15 = M12
    785 	mov ar.ec = 1
    786 }
    787 {	.mmi
    788 	mov M10 = M11
    789 	mov M1 = M6
    790 	mov M4 = M13
    791 } ;;
    792 
    793 {	.mmi
    794 	mov M9 = M14
    795 	mov M12 = M5
    796 	mov U = M3
    797 }
    798 {	.mmi
    799 	mov M11 = M8
    800 	mov M6 = M7
    801 	mov M13 = M2
    802 } ;;
    803 
    804 {	.mmi
    805 	mov M14 = Y
    806 	mov M5 = X
    807 	mov M3 = Z
    808 }
    809 {	.mmi
    810 	mov M8 = W
    811 	mov M7 = V
    812 	mov M2 = U
    813 } ;;
    814 
    815 .md5_II_round:
    816 	II4(.md5_II_round)
    817 
        // Hand the updated A..D back to the driver, which adds them into its
        // accumulators.
    818 {	.mib
    819 	nop 0x0
    820 	nop 0x0
    821 	br.ret.sptk.many QUICK_RTN
    822 } ;;
    823 
    824 	.endp md5_digest_GHI
    825 
        // FFLOADU(a, b, c, d, M, P, N, s, offset): F step for input that is
        // offset bytes past a 4-byte boundary.
        //
        // Performs the same F computation on M as FFLOAD and, interleaved
        // with it, realigns the following word P.  P still holds a raw
        // aligned load; it is replaced by GETLW(P) | DTmp, and the remaining
        // bytes of the raw P (GETRW) become the new carry in DTmp.  W is the
        // temporary for the merged word.  N is loaded raw while pMore is set.
    826 #define FFLOADU(a, b, c, d, M, P, N, s, offset)	\
    827 {	.mii ;					\
    828 (pMore) ld4 N = [DPtr], 4 ;			\
    829 	add Z = M, TRound ;			\
    830 	and Y = c, b ;				\
    831 }						\
    832 {	.mmi ;					\
    833 	andcm X = d, b ;;			\
    834 	add Z = Z, a ;				\
    835 	or Y = Y, X ;				\
    836 } ;;						\
    837 {	.mii ;					\
    838 	ld4 TRound = [TPtr], 4 ;		\
    839 	GETLW(W, P, offset) ;			\
    840 	add Z = Z, Y ;				\
    841 } ;;						\
    842 {	.mii ;					\
    843 	or W = W, DTmp ;			\
    844 	dep.z Y = Z, 32, 32 ;;			\
    845 	shrp Z = Z, Y, 64 - s ;			\
    846 } ;;						\
    847 {	.mii ;					\
    848 	add a = Z, b ;				\
    849 	GETRW(DTmp, P, offset) ;		\
    850 	mov P = W ;				\
    851 } ;;
    852 
        // FFLOOPU(a, b, c, d, M, P, N, s, offset): last F step of a pass for
        // misaligned input.
        //
        // Same as FFLOADU, except that the realignment of P and the DTmp
        // update are predicated on pMore, so they are skipped on the pass
        // that loads no further data.  It ends with the F-round loop control:
        // pMore = (LTrip != 0), LTrip decremented, br.ctop back to
        // .md5_FF_round<offset>.
    853 #define FFLOOPU(a, b, c, d, M, P, N, s, offset)		\
    854 {	.mii ;						\
    855 (pMore) ld4 N = [DPtr], 4 ;				\
    856 	add Z = M, TRound ;				\
    857 	and Y = c, b ;					\
    858 }							\
    859 {	.mmi ;						\
    860 	andcm X = d, b ;;				\
    861 	add Z = Z, a ;					\
    862 	or Y = Y, X ;					\
    863 } ;;							\
    864 {	.mii ;						\
    865 	ld4 TRound = [TPtr], 4 ;			\
    866 (pMore) GETLW(W, P, offset) 	;			\
    867 	add Z = Z, Y ;					\
    868 } ;;							\
    869 {	.mii ;						\
    870 (pMore) or W = W, DTmp ;				\
    871 	dep.z Y = Z, 32, 32 ;;				\
    872 	shrp Z = Z, Y, 64 - s ;				\
    873 } ;;							\
    874 {	.mii ;						\
    875 	add a = Z, b ;					\
    876 (pMore) GETRW(DTmp, P, offset) 	;			\
    877 (pMore) mov P = W ;					\
    878 }							\
    879 {	.mib ;						\
    880 	cmp.ne pMore, p0 = 0, LTrip ;			\
    881 	add LTrip = -1, LTrip ;				\
    882 	br.ctop.sptk.many .md5_FF_round##offset ;	\
    883 } ;;
    884 
        // MD5FBLOCK(offset): defines md5_digest_block<offset>, the F round
        // for input that is offset bytes past a 4-byte boundary.
        //
        // The setup matches md5_digest_block0 (alloc, LTrip = 2, ar.lc = 3,
        // pMore true, ar.ec = 0).  M12 arrives already merged by the driver;
        // each FFLOADU realigns the word used by the next step, and FFLOOPU
        // realigns the word just loaded into RotateM0, which is the M12 of
        // the next pass.  Unlike md5_digest_block0 these procedures cannot
        // fall through, so they end with an explicit branch to
        // md5_digest_GHI.  The macro is instantiated below for offsets 1, 2
        // and 3.
    885 #define MD5FBLOCK(offset)						\
    886 	.type md5_digest_block##offset, @function ;			\
    887 									\
    888 	.align 32 ;							\
    889 	.proc md5_digest_block##offset ;				\
    890 	.prologue ;							\
    891 	.altrp QUICK_RTN ;						\
    892 	.body ;								\
    893 md5_digest_block##offset:						\
    894 {	.mmi ;								\
    895 	alloc PFS = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE ;	\
    896 	mov LTrip = 2 ;							\
    897 	mov ar.lc = 3 ;							\
    898 } ;;									\
    899 {	.mii ;								\
    900 	cmp.eq pMore, p0 = r0, r0 ;					\
    901 	mov ar.ec = 0 ;							\
    902 	nop 0x0 ;							\
    903 } ;;									\
    904 									\
    905 	.pred.rel "mutex", pLoad, pSkip ;				\
    906 .md5_FF_round##offset:							\
    907 	FFLOADU(A, B, C, D, M12, M13, RotateM0, 7, offset)		\
    908 	FFLOADU(D, A, B, C, M13, M14, RotateM1, 12, offset)		\
    909 	FFLOADU(C, D, A, B, M14, M15, RotateM2, 17, offset)		\
    910 	FFLOOPU(B, C, D, A, M15, RotateM0, RotateM3, 22, offset)	\
    911 									\
    912 {	.mib ;								\
    913 	nop 0x0 ;							\
    914 	nop 0x0 ;							\
    915 	br.cond.sptk.many md5_digest_GHI ;				\
    916 } ;;									\
    917 	.endp md5_digest_block##offset
    918 
    919 MD5FBLOCK(1)
    920 MD5FBLOCK(2)
    921 MD5FBLOCK(3)
    922 
        // Round-constant table: 64 entries of 4 bytes each, one per step, in
        // the order the steps consume them.  Every entry is written as four
        // data1 bytes, least significant byte first.  The driver locates the
        // table ip-relative through .md5_tbl_data_order, keeps entry 0 in
        // CTable0, and the digest routines fetch the remaining entries one
        // at a time through TPtr.
    923 	.align 64
    924 	.type md5_constants, @object
    925 md5_constants:
    926 .md5_tbl_data_order:			// To ensure little-endian data
    927 					// order, code as bytes.
    928 	data1 0x78, 0xa4, 0x6a, 0xd7	//     0
    929 	data1 0x56, 0xb7, 0xc7, 0xe8	//     1
    930 	data1 0xdb, 0x70, 0x20, 0x24	//     2
    931 	data1 0xee, 0xce, 0xbd, 0xc1	//     3
    932 	data1 0xaf, 0x0f, 0x7c, 0xf5	//     4
    933 	data1 0x2a, 0xc6, 0x87, 0x47	//     5
    934 	data1 0x13, 0x46, 0x30, 0xa8	//     6
    935 	data1 0x01, 0x95, 0x46, 0xfd	//     7
    936 	data1 0xd8, 0x98, 0x80, 0x69	//     8
    937 	data1 0xaf, 0xf7, 0x44, 0x8b	//     9
    938 	data1 0xb1, 0x5b, 0xff, 0xff	//    10
    939 	data1 0xbe, 0xd7, 0x5c, 0x89	//    11
    940 	data1 0x22, 0x11, 0x90, 0x6b	//    12
    941 	data1 0x93, 0x71, 0x98, 0xfd	//    13
    942 	data1 0x8e, 0x43, 0x79, 0xa6	//    14
    943 	data1 0x21, 0x08, 0xb4, 0x49	//    15
    944 	data1 0x62, 0x25, 0x1e, 0xf6	//    16
    945 	data1 0x40, 0xb3, 0x40, 0xc0	//    17
    946 	data1 0x51, 0x5a, 0x5e, 0x26	//    18
    947 	data1 0xaa, 0xc7, 0xb6, 0xe9	//    19
    948 	data1 0x5d, 0x10, 0x2f, 0xd6	//    20
    949 	data1 0x53, 0x14, 0x44, 0x02	//    21
    950 	data1 0x81, 0xe6, 0xa1, 0xd8	//    22
    951 	data1 0xc8, 0xfb, 0xd3, 0xe7	//    23
    952 	data1 0xe6, 0xcd, 0xe1, 0x21	//    24
    953 	data1 0xd6, 0x07, 0x37, 0xc3	//    25
    954 	data1 0x87, 0x0d, 0xd5, 0xf4	//    26
    955 	data1 0xed, 0x14, 0x5a, 0x45	//    27
    956 	data1 0x05, 0xe9, 0xe3, 0xa9	//    28
    957 	data1 0xf8, 0xa3, 0xef, 0xfc	//    29
    958 	data1 0xd9, 0x02, 0x6f, 0x67	//    30
    959 	data1 0x8a, 0x4c, 0x2a, 0x8d	//    31
    960 	data1 0x42, 0x39, 0xfa, 0xff	//    32
    961 	data1 0x81, 0xf6, 0x71, 0x87	//    33
    962 	data1 0x22, 0x61, 0x9d, 0x6d	//    34
    963 	data1 0x0c, 0x38, 0xe5, 0xfd	//    35
    964 	data1 0x44, 0xea, 0xbe, 0xa4	//    36
    965 	data1 0xa9, 0xcf, 0xde, 0x4b	//    37
    966 	data1 0x60, 0x4b, 0xbb, 0xf6	//    38
    967 	data1 0x70, 0xbc, 0xbf, 0xbe	//    39
    968 	data1 0xc6, 0x7e, 0x9b, 0x28	//    40
    969 	data1 0xfa, 0x27, 0xa1, 0xea	//    41
    970 	data1 0x85, 0x30, 0xef, 0xd4	//    42
    971 	data1 0x05, 0x1d, 0x88, 0x04	//    43
    972 	data1 0x39, 0xd0, 0xd4, 0xd9	//    44
    973 	data1 0xe5, 0x99, 0xdb, 0xe6	//    45
    974 	data1 0xf8, 0x7c, 0xa2, 0x1f	//    46
    975 	data1 0x65, 0x56, 0xac, 0xc4	//    47
    976 	data1 0x44, 0x22, 0x29, 0xf4	//    48
    977 	data1 0x97, 0xff, 0x2a, 0x43	//    49
    978 	data1 0xa7, 0x23, 0x94, 0xab	//    50
    979 	data1 0x39, 0xa0, 0x93, 0xfc	//    51
    980 	data1 0xc3, 0x59, 0x5b, 0x65	//    52
    981 	data1 0x92, 0xcc, 0x0c, 0x8f	//    53
    982 	data1 0x7d, 0xf4, 0xef, 0xff	//    54
    983 	data1 0xd1, 0x5d, 0x84, 0x85	//    55
    984 	data1 0x4f, 0x7e, 0xa8, 0x6f	//    56
    985 	data1 0xe0, 0xe6, 0x2c, 0xfe	//    57
    986 	data1 0x14, 0x43, 0x01, 0xa3	//    58
    987 	data1 0xa1, 0x11, 0x08, 0x4e	//    59
    988 	data1 0x82, 0x7e, 0x53, 0xf7	//    60
    989 	data1 0x35, 0xf2, 0x3a, 0xbd	//    61
    990 	data1 0xbb, 0xd2, 0xd7, 0x2a	//    62
    991 	data1 0x91, 0xd3, 0x86, 0xeb	//    63
    992 .size	md5_constants#,64*4
    993