Home | History | Annotate | Download | only in asm
      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
      5 # project. The module is, however, dual licensed under OpenSSL and
      6 # CRYPTOGAMS licenses depending on where you obtain it. For further
      7 # details see http://www.openssl.org/~appro/cryptogams/.
      8 # ====================================================================
      9 
# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise it is a straightforward implementation
# with the X vector kept in the register bank. The module is big-endian
# [which is not a big deal, as there are no little-endian targets left
# around].
#
# (*) Does this mean that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned loads?
     17 
     18 # 			-m64	-m32
     19 # ----------------------------------
     20 # PPC970,gcc-4.0.0	+76%	+59%
     21 # Power6,xlc-7		+68%	+33%
     22 
     23 $flavour = shift;
     24 
     25 if ($flavour =~ /64/) {
     26 	$SIZE_T	=8;
     27 	$LRSAVE	=2*$SIZE_T;
     28 	$UCMP	="cmpld";
     29 	$STU	="stdu";
     30 	$POP	="ld";
     31 	$PUSH	="std";
     32 } elsif ($flavour =~ /32/) {
     33 	$SIZE_T	=4;
     34 	$LRSAVE	=$SIZE_T;
     35 	$UCMP	="cmplw";
     36 	$STU	="stwu";
     37 	$POP	="lwz";
     38 	$PUSH	="stw";
     39 } else { die "nonsense $flavour"; }
     40 
     41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
     42 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
     43 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
     44 die "can't locate ppc-xlate.pl";
     45 
     46 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
     47 
     48 $FRAME=24*$SIZE_T+64;
     49 $LOCALS=6*$SIZE_T;
     50 
     51 $K  ="r0";
     52 $sp ="r1";
     53 $toc="r2";
     54 $ctx="r3";
     55 $inp="r4";
     56 $num="r5";
     57 $t0 ="r15";
     58 $t1 ="r6";
     59 
     60 $A  ="r7";
     61 $B  ="r8";
     62 $C  ="r9";
     63 $D  ="r10";
     64 $E  ="r11";
     65 $T  ="r12";
     66 
     67 @V=($A,$B,$C,$D,$E,$T);
     68 @X=("r16","r17","r18","r19","r20","r21","r22","r23",
     69     "r24","r25","r26","r27","r28","r29","r30","r31");
     70 
     71 sub BODY_00_19 {
     72 my ($i,$a,$b,$c,$d,$e,$f)=@_;
     73 my $j=$i+1;
     74 $code.=<<___ if ($i==0);
     75 	lwz	@X[$i],`$i*4`($inp)
     76 ___
     77 $code.=<<___ if ($i<15);
     78 	lwz	@X[$j],`$j*4`($inp)
     79 	add	$f,$K,$e
     80 	rotlwi	$e,$a,5
     81 	add	$f,$f,@X[$i]
     82 	and	$t0,$c,$b
     83 	add	$f,$f,$e
     84 	andc	$t1,$d,$b
     85 	rotlwi	$b,$b,30
     86 	or	$t0,$t0,$t1
     87 	add	$f,$f,$t0
     88 ___
     89 $code.=<<___ if ($i>=15);
     90 	add	$f,$K,$e
     91 	rotlwi	$e,$a,5
     92 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
     93 	add	$f,$f,@X[$i%16]
     94 	and	$t0,$c,$b
     95 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
     96 	add	$f,$f,$e
     97 	andc	$t1,$d,$b
     98 	rotlwi	$b,$b,30
     99 	or	$t0,$t0,$t1
    100 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
    101 	add	$f,$f,$t0
    102 	rotlwi	@X[$j%16],@X[$j%16],1
    103 ___
    104 }
    105 
    106 sub BODY_20_39 {
    107 my ($i,$a,$b,$c,$d,$e,$f)=@_;
    108 my $j=$i+1;
    109 $code.=<<___ if ($i<79);
    110 	add	$f,$K,$e
    111 	rotlwi	$e,$a,5
    112 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
    113 	add	$f,$f,@X[$i%16]
    114 	xor	$t0,$b,$c
    115 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
    116 	add	$f,$f,$e
    117 	rotlwi	$b,$b,30
    118 	xor	$t0,$t0,$d
    119 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
    120 	add	$f,$f,$t0
    121 	rotlwi	@X[$j%16],@X[$j%16],1
    122 ___
    123 $code.=<<___ if ($i==79);
    124 	add	$f,$K,$e
    125 	rotlwi	$e,$a,5
    126 	lwz	r16,0($ctx)
    127 	add	$f,$f,@X[$i%16]
    128 	xor	$t0,$b,$c
    129 	lwz	r17,4($ctx)
    130 	add	$f,$f,$e
    131 	rotlwi	$b,$b,30
    132 	lwz	r18,8($ctx)
    133 	xor	$t0,$t0,$d
    134 	lwz	r19,12($ctx)
    135 	add	$f,$f,$t0
    136 	lwz	r20,16($ctx)
    137 ___
    138 }
    139 
    140 sub BODY_40_59 {
    141 my ($i,$a,$b,$c,$d,$e,$f)=@_;
    142 my $j=$i+1;
    143 $code.=<<___;
    144 	add	$f,$K,$e
    145 	rotlwi	$e,$a,5
    146 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
    147 	add	$f,$f,@X[$i%16]
    148 	and	$t0,$b,$c
    149 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
    150 	add	$f,$f,$e
    151 	or	$t1,$b,$c
    152 	rotlwi	$b,$b,30
    153 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
    154 	and	$t1,$t1,$d
    155 	or	$t0,$t0,$t1
    156 	rotlwi	@X[$j%16],@X[$j%16],1
    157 	add	$f,$f,$t0
    158 ___
    159 }
    160 
# Public entry point, .sha1_block_data_order.  The text below is emitted
# verbatim (after variable interpolation); the `...` expressions are
# evaluated by the substitution at the end of this script.
#
# Prologue: allocate a $FRAME-byte frame, save r15..r31 in its topmost 17
# slots and the link register at $FRAME+$LRSAVE, then load the five state
# words at offsets 0..16 from $ctx into $A..$E.
#
# Aligned input ($inp & 3 == 0): the count register is set to $num and
# Lsha1_block_private digests everything in one call.
#
# Unaligned input (Lunaligned): $t1 becomes the number of whole 64-byte
# blocks before the next 4096-byte boundary.  If $num does not exceed it,
# the aligned path is taken.  Otherwise those $t1 blocks are digested in
# place, and the following block is copied byte by byte into the buffer at
# $sp+$LOCALS and digested from there.  $inp is parked in the slot at
# $FRAME-$SIZE_T*18 meanwhile.  This repeats from Lunaligned while $num is
# non-zero.
#
# Epilogue (Ldone): restore the link register and r15..r31, release the
# frame and return.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___
    269 
    270 # This is private block function, which uses tailored calling
    271 # interface, namely upon entry SHA_CTX is pre-loaded to given
    272 # registers and counter register contains amount of chunks to
    273 # digest...
    274 $code.=<<___;
    275 .align	4
    276 Lsha1_block_private:
    277 ___
    278 $code.=<<___;	# load K_00_19
    279 	lis	$K,0x5a82
    280 	ori	$K,$K,0x7999
    281 ___
    282 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
    283 $code.=<<___;	# load K_20_39
    284 	lis	$K,0x6ed9
    285 	ori	$K,$K,0xeba1
    286 ___
    287 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    288 $code.=<<___;	# load K_40_59
    289 	lis	$K,0x8f1b
    290 	ori	$K,$K,0xbcdc
    291 ___
    292 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    293 $code.=<<___;	# load K_60_79
    294 	lis	$K,0xca62
    295 	ori	$K,$K,0xc1d6
    296 ___
    297 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    298 $code.=<<___;
    299 	add	r16,r16,$E
    300 	add	r17,r17,$T
    301 	add	r18,r18,$A
    302 	add	r19,r19,$B
    303 	add	r20,r20,$C
    304 	stw	r16,0($ctx)
    305 	mr	$A,r16
    306 	stw	r17,4($ctx)
    307 	mr	$B,r17
    308 	stw	r18,8($ctx)
    309 	mr	$C,r18
    310 	stw	r19,12($ctx)
    311 	mr	$D,r19
    312 	stw	r20,16($ctx)
    313 	mr	$E,r20
    314 	addi	$inp,$inp,`16*4`
    315 	bdnz-	Lsha1_block_private
    316 	blr
    317 	.long	0
    318 	.byte	0,12,0x14,0,0,0,0,0
    319 ___
    320 $code.=<<___;
    321 .asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
    322 ___
    323 
    324 $code =~ s/\`([^\`]*)\`/eval $1/gem;
    325 print $code;
    326 close STDOUT;
    327