Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
     10 # I let hardware handle unaligned input(*), except on page boundaries
     11 # (see below for details). Otherwise straightforward implementation
     12 # with X vector in register bank. The module is big-endian [which is
     13 # not big deal as there're no little-endian targets left around].
     14 #
     15 # (*) this means that this module is inappropriate for PPC403? Does
     16 #     anybody know if pre-POWER3 can sustain unaligned load?
     17 
     18 # 			-m64	-m32
     19 # ----------------------------------
     20 # PPC970,gcc-4.0.0	+76%	+59%
     21 # Power6,xlc-7		+68%	+33%
     22 
     23 $flavour = shift;
     24 
     25 if ($flavour =~ /64/) {
     26 	$SIZE_T	=8;
     27 	$UCMP	="cmpld";
     28 	$STU	="stdu";
     29 	$POP	="ld";
     30 	$PUSH	="std";
     31 } elsif ($flavour =~ /32/) {
     32 	$SIZE_T	=4;
     33 	$UCMP	="cmplw";
     34 	$STU	="stwu";
     35 	$POP	="lwz";
     36 	$PUSH	="stw";
     37 } else { die "nonsense $flavour"; }
     38 
     39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     40 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
     41 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
     42 die "can't locate ppc-xlate.pl";
     43 
     44 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
     45 
     46 $FRAME=24*$SIZE_T;
     47 
     48 $K  ="r0";
     49 $sp ="r1";
     50 $toc="r2";
     51 $ctx="r3";
     52 $inp="r4";
     53 $num="r5";
     54 $t0 ="r15";
     55 $t1 ="r6";
     56 
     57 $A  ="r7";
     58 $B  ="r8";
     59 $C  ="r9";
     60 $D  ="r10";
     61 $E  ="r11";
     62 $T  ="r12";
     63 
     64 @V=($A,$B,$C,$D,$E,$T);
     65 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
     66     "r24","r25","r26","r27","r28","r29","r30","r31");
     67 
     68 sub BODY_00_19 {
     69 my ($i,$a,$b,$c,$d,$e,$f)=@_;
     70 my $j=$i+1;
     71 $code.=<<___ if ($i==0);
     72 	lwz	@X[$i],`$i*4`($inp)
     73 ___
     74 $code.=<<___ if ($i<15);
     75 	lwz	@X[$j],`$j*4`($inp)
     76 	add	$f,$K,$e
     77 	rotlwi	$e,$a,5
     78 	add	$f,$f,@X[$i]
     79 	and	$t0,$c,$b
     80 	add	$f,$f,$e
     81 	andc	$t1,$d,$b
     82 	rotlwi	$b,$b,30
     83 	or	$t0,$t0,$t1
     84 	add	$f,$f,$t0
     85 ___
     86 $code.=<<___ if ($i>=15);
     87 	add	$f,$K,$e
     88 	rotlwi	$e,$a,5
     89 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
     90 	add	$f,$f,@X[$i%16]
     91 	and	$t0,$c,$b
     92 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
     93 	add	$f,$f,$e
     94 	andc	$t1,$d,$b
     95 	rotlwi	$b,$b,30
     96 	or	$t0,$t0,$t1
     97 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
     98 	add	$f,$f,$t0
     99 	rotlwi	@X[$j%16],@X[$j%16],1
    100 ___
    101 }
    102 
    103 sub BODY_20_39 {
    104 my ($i,$a,$b,$c,$d,$e,$f)=@_;
    105 my $j=$i+1;
    106 $code.=<<___ if ($i<79);
    107 	add	$f,$K,$e
    108 	rotlwi	$e,$a,5
    109 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
    110 	add	$f,$f,@X[$i%16]
    111 	xor	$t0,$b,$c
    112 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
    113 	add	$f,$f,$e
    114 	rotlwi	$b,$b,30
    115 	xor	$t0,$t0,$d
    116 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
    117 	add	$f,$f,$t0
    118 	rotlwi	@X[$j%16],@X[$j%16],1
    119 ___
    120 $code.=<<___ if ($i==79);
    121 	add	$f,$K,$e
    122 	rotlwi	$e,$a,5
    123 	lwz	r16,0($ctx)
    124 	add	$f,$f,@X[$i%16]
    125 	xor	$t0,$b,$c
    126 	lwz	r17,4($ctx)
    127 	add	$f,$f,$e
    128 	rotlwi	$b,$b,30
    129 	lwz	r18,8($ctx)
    130 	xor	$t0,$t0,$d
    131 	lwz	r19,12($ctx)
    132 	add	$f,$f,$t0
    133 	lwz	r20,16($ctx)
    134 ___
    135 }
    136 
    137 sub BODY_40_59 {
    138 my ($i,$a,$b,$c,$d,$e,$f)=@_;
    139 my $j=$i+1;
    140 $code.=<<___;
    141 	add	$f,$K,$e
    142 	rotlwi	$e,$a,5
    143 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
    144 	add	$f,$f,@X[$i%16]
    145 	and	$t0,$b,$c
    146 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
    147 	add	$f,$f,$e
    148 	or	$t1,$b,$c
    149 	rotlwi	$b,$b,30
    150 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
    151 	and	$t1,$t1,$d
    152 	or	$t0,$t0,$t1
    153 	rotlwi	@X[$j%16],@X[$j%16],1
    154 	add	$f,$f,$t0
    155 ___
    156 }
    157 
    158 $code=<<___;
    159 .machine	"any"
    160 .text
    161 
    162 .globl	.sha1_block_data_order
    163 .align	4
    164 .sha1_block_data_order:
    165 	mflr	r0
    166 	$STU	$sp,`-($FRAME+64)`($sp)
    167 	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
    168 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
    169 	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
    170 	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
    171 	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
    172 	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
    173 	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
    174 	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
    175 	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
    176 	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
    177 	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
    178 	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
    179 	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
    180 	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
    181 	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
    182 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
    183 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
    184 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
    185 	lwz	$A,0($ctx)
    186 	lwz	$B,4($ctx)
    187 	lwz	$C,8($ctx)
    188 	lwz	$D,12($ctx)
    189 	lwz	$E,16($ctx)
    190 	andi.	r0,$inp,3
    191 	bne	Lunaligned
    192 Laligned:
    193 	mtctr	$num
    194 	bl	Lsha1_block_private
    195 Ldone:
    196 	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
    197 	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
    198 	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
    199 	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
    200 	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
    201 	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
    202 	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
    203 	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
    204 	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
    205 	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
    206 	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
    207 	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
    208 	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
    209 	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
    210 	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
    211 	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
    212 	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
    213 	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
    214 	mtlr	r0
    215 	addi	$sp,$sp,`$FRAME+64`
    216 	blr
    217 ___
    218 
    219 # PowerPC specification allows an implementation to be ill-behaved
    220 # upon unaligned access which crosses page boundary. "Better safe
    221 # than sorry" principle makes me treat it specially. But I don't
    222 # look for particular offending word, but rather for 64-byte input
    223 # block which crosses the boundary. Once found that block is aligned
    224 # and hashed separately...
    225 $code.=<<___;
    226 .align	4
    227 Lunaligned:
    228 	subfic	$t1,$inp,4096
    229 	andi.	$t1,$t1,4095	; distance to closest page boundary
    230 	srwi.	$t1,$t1,6	; t1/=64
    231 	beq	Lcross_page
    232 	$UCMP	$num,$t1
    233 	ble-	Laligned	; didn't cross the page boundary
    234 	mtctr	$t1
    235 	subfc	$num,$t1,$num
    236 	bl	Lsha1_block_private
    237 Lcross_page:
    238 	li	$t1,16
    239 	mtctr	$t1
    240 	addi	r20,$sp,$FRAME	; spot below the frame
    241 Lmemcpy:
    242 	lbz	r16,0($inp)
    243 	lbz	r17,1($inp)
    244 	lbz	r18,2($inp)
    245 	lbz	r19,3($inp)
    246 	addi	$inp,$inp,4
    247 	stb	r16,0(r20)
    248 	stb	r17,1(r20)
    249 	stb	r18,2(r20)
    250 	stb	r19,3(r20)
    251 	addi	r20,r20,4
    252 	bdnz	Lmemcpy
    253 
    254 	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
    255 	li	$t1,1
    256 	addi	$inp,$sp,$FRAME
    257 	mtctr	$t1
    258 	bl	Lsha1_block_private
    259 	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
    260 	addic.	$num,$num,-1
    261 	bne-	Lunaligned
    262 	b	Ldone
    263 ___
    264 
    265 # This is private block function, which uses tailored calling
    266 # interface, namely upon entry SHA_CTX is pre-loaded to given
    267 # registers and counter register contains amount of chunks to
    268 # digest...
    269 $code.=<<___;
    270 .align	4
    271 Lsha1_block_private:
    272 ___
    273 $code.=<<___;	# load K_00_19
    274 	lis	$K,0x5a82
    275 	ori	$K,$K,0x7999
    276 ___
    277 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    278 $code.=<<___;	# load K_20_39
    279 	lis	$K,0x6ed9
    280 	ori	$K,$K,0xeba1
    281 ___
    282 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    283 $code.=<<___;	# load K_40_59
    284 	lis	$K,0x8f1b
    285 	ori	$K,$K,0xbcdc
    286 ___
    287 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    288 $code.=<<___;	# load K_60_79
    289 	lis	$K,0xca62
    290 	ori	$K,$K,0xc1d6
    291 ___
    292 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    293 $code.=<<___;
    294 	add	r16,r16,$E
    295 	add	r17,r17,$T
    296 	add	r18,r18,$A
    297 	add	r19,r19,$B
    298 	add	r20,r20,$C
    299 	stw	r16,0($ctx)
    300 	mr	$A,r16
    301 	stw	r17,4($ctx)
    302 	mr	$B,r17
    303 	stw	r18,8($ctx)
    304 	mr	$C,r18
    305 	stw	r19,12($ctx)
    306 	mr	$D,r19
    307 	stw	r20,16($ctx)
    308 	mr	$E,r20
    309 	addi	$inp,$inp,`16*4`
    310 	bdnz-	Lsha1_block_private
    311 	blr
    312 ___
    313 $code.=<<___;
    314 .asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
    315 ___
    316 
    317 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    318 print $code;
    319 close STDOUT;
    320