#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# The eternal question is: what's wrong with compiler-generated code?
# The trick is that it's possible to reduce the number of shifts required
# to perform rotations by maintaining a copy of the 32-bit value in the
# upper bits of a 64-bit register. Just follow the mux2 and shrp
# instructions... Performance under a big-endian OS such as HP-UX is
# 179MBps*1GHz, which is >50% better than HP C and >2x better than gcc.

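# A minimal Perl model of that rotation trick, for illustration only: it
# assumes a perl with 64-bit integers and is not used by the generator
# below. mux2 with template 0x44 replicates the low 32 bits of a register
# into both 32-bit halves, after which a single shrp (funnel shift right)
# yields the rotated value in the low half, replacing two shifts and an or.
sub rot32r {			# rotate 32-bit $x right by $n, the shrp way
my ($x,$n)=@_;
my $dup=($x<<32)|$x;		# mux2	dup=x,0x44
return ($dup>>$n)&0xffffffff;	# shrp	r=dup,dup,n
}
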
$code=<<___;
.ident  \"sha1-ia64.s, version 1.3\"
.ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
.explicit

___

# The HP-UX assembler expects addp4 for pointer arithmetic under the
# 32-bit ABI; plain add suffices when a 64-bit ABI (+DD64 or -mlp64)
# is selected.
if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/(\+DD|\-mlp)64/); }
} else { $ADDP="add"; }

#$human=1;
if ($human) {	# useful for visual code auditing...
	($A,$B,$C,$D,$E)   = ("A","B","C","D","E");
	($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
	    (	"K_00_19","K_20_39","K_40_59","K_60_79"	);
	@X= (	"X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",
		"X8", "X9","X10","X11","X12","X13","X14","X15"	);
}
else {
	($A,$B,$C,$D,$E)   =    ("loc0","loc1","loc2","loc3","loc4");
	($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
	    (	"r14", "r15", "loc10", "loc11"	);
	@X= (	"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
		"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"	);
}

sub BODY_00_15 {
local	*code=shift;
my	($i,$a,$b,$c,$d,$e)=@_;
my	$j=$i+1;
my	$Xn=$X[$j%16];

$code.=<<___ if ($i==0);
{ .mmi;	ld1	$X[$i]=[inp],2		    // MSB
	ld1	tmp2=[tmp3],2		};;
{ .mmi;	ld1	tmp0=[inp],2
	ld1	tmp4=[tmp3],2		    // LSB
	dep	$X[$i]=$X[$i],tmp2,8,8	};;
___
if ($i<15) {
	$code.=<<___;
{ .mmi;	ld1	$Xn=[inp],2		    // forward Xload
	nop.m	0x0
	dep	tmp1=tmp0,tmp4,8,8	};;
{ .mmi;	ld1	tmp2=[tmp3],2		    // forward Xload
	and	tmp4=$c,$b
	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
	andcm	tmp1=$d,$b
	dep.z	tmp5=$a,5,27		};; // a<<5
{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xload
	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	extr.u	tmp1=$a,27,5		};; // a>>27
{ .mmi;	ld1	tmp0=[inp],2		    // forward Xload
	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	ld1	tmp4=[tmp3],2		    // forward Xload
	or	tmp5=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)
	dep	$Xn=$Xn,tmp2,8,8	    // forward Xload
	mux2	$X[$i]=$X[$i],0x44	} //;;

___
	}
else	{
	$code.=<<___;
{ .mii;	and	tmp3=$c,$b
	dep	tmp1=tmp0,tmp4,8,8;;
	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
	andcm	tmp1=$d,$b
	dep.z	tmp5=$a,5,27		};; // a<<5
{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xupdate
	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
	nop.i	0			};;
{ .mmi;	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
	xor	$Xn=$Xn,tmp3		    // forward Xupdate
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi; or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	mux2	$X[$i]=$X[$i],0x44	};;

___
	}
}

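# A sketch in plain Perl of what the ld1/dep sequence above computes
# (illustration only, not used by the generator): four single-byte loads
# are merged MSB-first into one big-endian 32-bit word with two 8-bit
# deposits followed by one 16-bit deposit.
sub be32 {			# assemble bytes $b0..$b3, $b0 being the MSB
my ($b0,$b1,$b2,$b3)=@_;
my $hi=($b0<<8)|$b1;		# dep	X=X,tmp2,8,8
my $lo=($b2<<8)|$b3;		# dep	tmp1=tmp0,tmp4,8,8
return ($hi<<16)|$lo;		# dep	X=X,tmp1,16,16
}
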
sub BODY_16_19 {
local	*code=shift;
my	($i,$a,$b,$c,$d,$e)=@_;
my	$j=$i+1;
my	$Xn=$X[$j%16];

$code.=<<___;
{ .mib;	add	$e=$e,$K_00_19		    // e+=K_00_19
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	andcm	tmp1=$d,$b
	and	tmp0=$c,$b		};;
{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16]	// forward Xupdate
	nop.i	0			};;
{ .mmi;	add	$e=$e,tmp0		    // e+=F_00_19(b,c,d)
	xor	$Xn=$Xn,tmp3		    // forward Xupdate
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0			};;

___
}

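# A reference model of the interleaved "forward Xupdate" (illustration
# only, reusing the hypothetical rot32r sketch near the top of this file):
# the SHA-1 message schedule W[t]=ROTATE(W[t-3]^W[t-8]^W[t-14]^W[t-16],1)
# lives in a 16-word circular buffer, hence the j, j+2, j+8, j+13 indices,
# all taken mod 16, and rotate-left-1 expressed as rotate-right-31.
sub xupdate {
my ($W,$j)=@_;			# $W is a reference to the 16-word window
return rot32r($W->[$j%16]^$W->[($j+2)%16]^$W->[($j+8)%16]^$W->[($j+13)%16],31);
}
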
sub BODY_20_39 {
local	*code=shift;
my	($i,$a,$b,$c,$d,$e,$Konst)=@_;
	$Konst = $K_20_39 if (!defined($Konst));
my	$j=$i+1;
my	$Xn=$X[$j%16];

if ($i<79) {
$code.=<<___;
{ .mib;	add	$e=$e,$Konst		    // e+=K_XX_XX
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	xor	tmp0=$c,$b
	xor	$Xn=$Xn,$X[($j+2)%16]	};; // forward Xupdate
{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	xor	$Xn=$Xn,$X[($j+8)%16]	};; // forward Xupdate
{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
	xor	$Xn=$Xn,$X[($j+13)%16]	    // forward Xupdate
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0			};;

___
}
else {
$code.=<<___;
{ .mib;	add	$e=$e,$Konst		    // e+=K_60_79
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	xor	tmp0=$c,$b
	add	$h1=$h1,$a		};; // wrap up
{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
	add	$h3=$h3,$c		};; // wrap up
{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
	shrp	$b=tmp6,tmp6,2		};; // b=ROTATE(b,30) ;;?
{ .mmi;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
	add	tmp3=1,inp		    // used in unaligned codepath
	add	$h4=$h4,$d		};; // wrap up

___
}
}

sub BODY_40_59 {
local	*code=shift;
my	($i,$a,$b,$c,$d,$e)=@_;
my	$j=$i+1;
my	$Xn=$X[$j%16];

$code.=<<___;
{ .mib;	add	$e=$e,$K_40_59		    // e+=K_40_59
	dep.z	tmp5=$a,5,27		}   // a<<5
{ .mib;	and	tmp1=$c,$d
	xor	tmp0=$c,$d		};;
{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
	add	tmp5=tmp5,tmp1		    // a<<5+(c&d)
	extr.u	tmp1=$a,27,5		}   // a>>27
{ .mmi;	and	tmp0=tmp0,$b
	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] };;	// forward Xupdate
{ .mmi;	add	$e=$e,tmp0		    // e+=b&(c^d)
	add	tmp5=tmp5,tmp1		    // ROTATE(a,5)+(c&d)
	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
{ .mmi;	xor	$Xn=$Xn,tmp3
	mux2	tmp6=$a,0x44		};; // see b in next iteration
{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)+(c&d)
	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
	nop.i	0x0			};;

___
}
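# Why BODY_40_59 can fold (c&d) into the same accumulator as a<<5: the
# majority function F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) decomposes exactly
# into (b&(c^d))+(c&d), the two terms being bitwise disjoint. A throwaway
# self-check of that identity, guarded so it only runs in $human mode:
if ($human) {
	for (1..1000) {
	    my ($b,$c,$d)=map { int(rand(1<<32)) } 1..3;
	    (($b&$c)|($b&$d)|($c&$d)) == (($b&($c^$d))+($c&$d))
		or die "majority decomposition mismatch";
	}
}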
sub BODY_60_79	{ &BODY_20_39(@_,$K_60_79); }

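# Note on the loop emitted below: the outer per-block loop is a hardware
# counted loop rather than a compare-and-branch. num-1 is moved into
# ar.lc, ar.ec is set to 1 (no software-pipeline epilogue stages), and
# br.ctop.dptk at the bottom decrements ar.lc and branches back to .Ldtop
# until the counter is exhausted.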
$code.=<<___;
.text

tmp0=r8;
tmp1=r9;
tmp2=r10;
tmp3=r11;
ctx=r32;	// in0
inp=r33;	// in1

// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num);
.global	sha1_block_data_order#
.proc	sha1_block_data_order#
.align	32
sha1_block_data_order:
	.prologue
{ .mmi;	alloc	tmp1=ar.pfs,3,14,0,0
	$ADDP	tmp0=4,ctx
	.save	ar.lc,r3
	mov	r3=ar.lc		}
{ .mmi;	$ADDP	ctx=0,ctx
	$ADDP	inp=0,inp
	mov	r2=pr			};;
tmp4=in2;
tmp5=loc12;
tmp6=loc13;
	.body
{ .mlx;	ld4	$h0=[ctx],8
	movl	$K_00_19=0x5a827999	}
{ .mlx;	ld4	$h1=[tmp0],8
	movl	$K_20_39=0x6ed9eba1	};;
{ .mlx;	ld4	$h2=[ctx],8
	movl	$K_40_59=0x8f1bbcdc	}
{ .mlx;	ld4	$h3=[tmp0]
	movl	$K_60_79=0xca62c1d6	};;
{ .mmi;	ld4	$h4=[ctx],-16
	add	in2=-1,in2		    // adjust num for ar.lc
	mov	ar.ec=1			};;
{ .mmi;	nop.m	0
	add	tmp3=1,inp
	mov	ar.lc=in2		};; // brp.loop.imp: too far

.Ldtop:
{ .mmi;	mov	$A=$h0
	mov	$B=$h1
	mux2	tmp6=$h1,0x44		}
{ .mmi;	mov	$C=$h2
	mov	$D=$h3
	mov	$E=$h4			};;

___

{ my $i;
  my @V=($A,$B,$C,$D,$E);

	# All 80 rounds are fully unrolled; instead of shuffling values
	# between registers, rotate the variable names so that round i+1
	# sees (a,b,c,d,e) = previous (e,a,b,c,d).
	for($i=0;$i<16;$i++)	{ &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<20;$i++)	{ &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<40;$i++)	{ &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<60;$i++)	{ &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
	for(;$i<80;$i++)	{ &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }

	(($V[0] eq $A) and ($V[4] eq $E)) or die;	# double-check
}

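# For cross-checking the unrolled schedule, one SHA-1 round in the same
# renaming scheme reads, in Perl pseudocode (rot32r as sketched above,
# $f/$K/$w standing for the round function, constant and message word):
#
#	$e = ($e + rot32r($a,27) + $f + $K + $w) & 0xffffffff;
#	$b = rot32r($b,2);
#	# then rotate the names: (a,b,c,d,e) = (e,a,b,c,d)
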
$code.=<<___;
{ .mmb;	add	$h0=$h0,$A
	add	$h2=$h2,$C
	br.ctop.dptk.many	.Ldtop	};;
.Ldend:
{ .mmi;	add	tmp0=4,ctx
	mov	ar.lc=r3		};;
{ .mmi;	st4	[ctx]=$h0,8
	st4	[tmp0]=$h1,8		};;
{ .mmi;	st4	[ctx]=$h2,8
	st4	[tmp0]=$h3		};;
{ .mib;	st4	[ctx]=$h4,-16
	mov	pr=r2,0x1ffff
	br.ret.sptk.many	b0	};;
.endp	sha1_block_data_order#
stringz	"SHA1 block transform for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=shift and open STDOUT,">$output";
print $code;