1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Written by Andy Polyakov <appro (at] fy.chalmers.se> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 10 # I let hardware handle unaligned input(*), except on page boundaries 11 # (see below for details). Otherwise straightforward implementation 12 # with X vector in register bank. The module is big-endian [which is 13 # not big deal as there're no little-endian targets left around]. 14 # 15 # (*) this means that this module is inappropriate for PPC403? Does 16 # anybody know if pre-POWER3 can sustain unaligned load? 17 18 # -m64 -m32 19 # ---------------------------------- 20 # PPC970,gcc-4.0.0 +76% +59% 21 # Power6,xlc-7 +68% +33% 22 23 $flavour = shift; 24 25 if ($flavour =~ /64/) { 26 $SIZE_T =8; 27 $UCMP ="cmpld"; 28 $STU ="stdu"; 29 $POP ="ld"; 30 $PUSH ="std"; 31 } elsif ($flavour =~ /32/) { 32 $SIZE_T =4; 33 $UCMP ="cmplw"; 34 $STU ="stwu"; 35 $POP ="lwz"; 36 $PUSH ="stw"; 37 } else { die "nonsense $flavour"; } 38 39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 40 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or 41 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or 42 die "can't locate ppc-xlate.pl"; 43 44 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; 45 46 $FRAME=24*$SIZE_T; 47 48 $K ="r0"; 49 $sp ="r1"; 50 $toc="r2"; 51 $ctx="r3"; 52 $inp="r4"; 53 $num="r5"; 54 $t0 ="r15"; 55 $t1 ="r6"; 56 57 $A ="r7"; 58 $B ="r8"; 59 $C ="r9"; 60 $D ="r10"; 61 $E ="r11"; 62 $T ="r12"; 63 64 @V=($A,$B,$C,$D,$E,$T); 65 @X=("r16","r17","r18","r19","r20","r21","r22","r23", 66 "r24","r25","r26","r27","r28","r29","r30","r31"); 67 68 sub BODY_00_19 { 69 my ($i,$a,$b,$c,$d,$e,$f)=@_; 70 my $j=$i+1; 71 $code.=<<___ if ($i==0); 72 lwz @X[$i],`$i*4`($inp) 73 ___ 74 $code.=<<___ if ($i<15); 75 lwz @X[$j],`$j*4`($inp) 76 add $f,$K,$e 77 rotlwi $e,$a,5 78 add $f,$f,@X[$i] 79 and $t0,$c,$b 80 add $f,$f,$e 81 andc $t1,$d,$b 82 rotlwi $b,$b,30 83 or $t0,$t0,$t1 84 add $f,$f,$t0 85 ___ 86 $code.=<<___ if ($i>=15); 87 add $f,$K,$e 88 rotlwi $e,$a,5 89 xor @X[$j%16],@X[$j%16],@X[($j+2)%16] 90 add $f,$f,@X[$i%16] 91 and $t0,$c,$b 92 xor @X[$j%16],@X[$j%16],@X[($j+8)%16] 93 add $f,$f,$e 94 andc $t1,$d,$b 95 rotlwi $b,$b,30 96 or $t0,$t0,$t1 97 xor @X[$j%16],@X[$j%16],@X[($j+13)%16] 98 add $f,$f,$t0 99 rotlwi @X[$j%16],@X[$j%16],1 100 ___ 101 } 102 103 sub BODY_20_39 { 104 my ($i,$a,$b,$c,$d,$e,$f)=@_; 105 my $j=$i+1; 106 $code.=<<___ if ($i<79); 107 add $f,$K,$e 108 rotlwi $e,$a,5 109 xor @X[$j%16],@X[$j%16],@X[($j+2)%16] 110 add $f,$f,@X[$i%16] 111 xor $t0,$b,$c 112 xor @X[$j%16],@X[$j%16],@X[($j+8)%16] 113 add $f,$f,$e 114 rotlwi $b,$b,30 115 xor $t0,$t0,$d 116 xor @X[$j%16],@X[$j%16],@X[($j+13)%16] 117 add $f,$f,$t0 118 rotlwi @X[$j%16],@X[$j%16],1 119 ___ 120 $code.=<<___ if ($i==79); 121 add $f,$K,$e 122 rotlwi $e,$a,5 123 lwz r16,0($ctx) 124 add $f,$f,@X[$i%16] 125 xor $t0,$b,$c 126 lwz r17,4($ctx) 127 add $f,$f,$e 128 rotlwi $b,$b,30 129 lwz r18,8($ctx) 130 xor $t0,$t0,$d 131 lwz r19,12($ctx) 132 add $f,$f,$t0 133 lwz r20,16($ctx) 134 ___ 135 } 136 137 sub BODY_40_59 { 138 my ($i,$a,$b,$c,$d,$e,$f)=@_; 139 my $j=$i+1; 140 $code.=<<___; 141 add $f,$K,$e 142 rotlwi $e,$a,5 143 xor @X[$j%16],@X[$j%16],@X[($j+2)%16] 144 add $f,$f,@X[$i%16] 145 and $t0,$b,$c 146 xor @X[$j%16],@X[$j%16],@X[($j+8)%16] 147 add $f,$f,$e 148 or $t1,$b,$c 149 rotlwi $b,$b,30 150 xor @X[$j%16],@X[$j%16],@X[($j+13)%16] 151 and $t1,$t1,$d 152 or $t0,$t0,$t1 153 rotlwi @X[$j%16],@X[$j%16],1 154 add $f,$f,$t0 155 ___ 156 } 157 158 $code=<<___; 159 .machine "any" 160 .text 161 162 .globl .sha1_block_data_order 163 .align 4 164 .sha1_block_data_order: 165 mflr r0 166 $STU $sp,`-($FRAME+64)`($sp) 167 $PUSH r0,`$FRAME-$SIZE_T*18`($sp) 168 $PUSH r15,`$FRAME-$SIZE_T*17`($sp) 169 $PUSH r16,`$FRAME-$SIZE_T*16`($sp) 170 $PUSH r17,`$FRAME-$SIZE_T*15`($sp) 171 $PUSH r18,`$FRAME-$SIZE_T*14`($sp) 172 $PUSH r19,`$FRAME-$SIZE_T*13`($sp) 173 $PUSH r20,`$FRAME-$SIZE_T*12`($sp) 174 $PUSH r21,`$FRAME-$SIZE_T*11`($sp) 175 $PUSH r22,`$FRAME-$SIZE_T*10`($sp) 176 $PUSH r23,`$FRAME-$SIZE_T*9`($sp) 177 $PUSH r24,`$FRAME-$SIZE_T*8`($sp) 178 $PUSH r25,`$FRAME-$SIZE_T*7`($sp) 179 $PUSH r26,`$FRAME-$SIZE_T*6`($sp) 180 $PUSH r27,`$FRAME-$SIZE_T*5`($sp) 181 $PUSH r28,`$FRAME-$SIZE_T*4`($sp) 182 $PUSH r29,`$FRAME-$SIZE_T*3`($sp) 183 $PUSH r30,`$FRAME-$SIZE_T*2`($sp) 184 $PUSH r31,`$FRAME-$SIZE_T*1`($sp) 185 lwz $A,0($ctx) 186 lwz $B,4($ctx) 187 lwz $C,8($ctx) 188 lwz $D,12($ctx) 189 lwz $E,16($ctx) 190 andi. r0,$inp,3 191 bne Lunaligned 192 Laligned: 193 mtctr $num 194 bl Lsha1_block_private 195 Ldone: 196 $POP r0,`$FRAME-$SIZE_T*18`($sp) 197 $POP r15,`$FRAME-$SIZE_T*17`($sp) 198 $POP r16,`$FRAME-$SIZE_T*16`($sp) 199 $POP r17,`$FRAME-$SIZE_T*15`($sp) 200 $POP r18,`$FRAME-$SIZE_T*14`($sp) 201 $POP r19,`$FRAME-$SIZE_T*13`($sp) 202 $POP r20,`$FRAME-$SIZE_T*12`($sp) 203 $POP r21,`$FRAME-$SIZE_T*11`($sp) 204 $POP r22,`$FRAME-$SIZE_T*10`($sp) 205 $POP r23,`$FRAME-$SIZE_T*9`($sp) 206 $POP r24,`$FRAME-$SIZE_T*8`($sp) 207 $POP r25,`$FRAME-$SIZE_T*7`($sp) 208 $POP r26,`$FRAME-$SIZE_T*6`($sp) 209 $POP r27,`$FRAME-$SIZE_T*5`($sp) 210 $POP r28,`$FRAME-$SIZE_T*4`($sp) 211 $POP r29,`$FRAME-$SIZE_T*3`($sp) 212 $POP r30,`$FRAME-$SIZE_T*2`($sp) 213 $POP r31,`$FRAME-$SIZE_T*1`($sp) 214 mtlr r0 215 addi $sp,$sp,`$FRAME+64` 216 blr 217 ___ 218 219 # PowerPC specification allows an implementation to be ill-behaved 220 # upon unaligned access which crosses page boundary. "Better safe 221 # than sorry" principle makes me treat it specially. But I don't 222 # look for particular offending word, but rather for 64-byte input 223 # block which crosses the boundary. Once found that block is aligned 224 # and hashed separately... 225 $code.=<<___; 226 .align 4 227 Lunaligned: 228 subfic $t1,$inp,4096 229 andi. $t1,$t1,4095 ; distance to closest page boundary 230 srwi. $t1,$t1,6 ; t1/=64 231 beq Lcross_page 232 $UCMP $num,$t1 233 ble- Laligned ; didn't cross the page boundary 234 mtctr $t1 235 subfc $num,$t1,$num 236 bl Lsha1_block_private 237 Lcross_page: 238 li $t1,16 239 mtctr $t1 240 addi r20,$sp,$FRAME ; spot below the frame 241 Lmemcpy: 242 lbz r16,0($inp) 243 lbz r17,1($inp) 244 lbz r18,2($inp) 245 lbz r19,3($inp) 246 addi $inp,$inp,4 247 stb r16,0(r20) 248 stb r17,1(r20) 249 stb r18,2(r20) 250 stb r19,3(r20) 251 addi r20,r20,4 252 bdnz Lmemcpy 253 254 $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) 255 li $t1,1 256 addi $inp,$sp,$FRAME 257 mtctr $t1 258 bl Lsha1_block_private 259 $POP $inp,`$FRAME-$SIZE_T*19`($sp) 260 addic. $num,$num,-1 261 bne- Lunaligned 262 b Ldone 263 ___ 264 265 # This is private block function, which uses tailored calling 266 # interface, namely upon entry SHA_CTX is pre-loaded to given 267 # registers and counter register contains amount of chunks to 268 # digest... 269 $code.=<<___; 270 .align 4 271 Lsha1_block_private: 272 ___ 273 $code.=<<___; # load K_00_19 274 lis $K,0x5a82 275 ori $K,$K,0x7999 276 ___ 277 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } 278 $code.=<<___; # load K_20_39 279 lis $K,0x6ed9 280 ori $K,$K,0xeba1 281 ___ 282 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 283 $code.=<<___; # load K_40_59 284 lis $K,0x8f1b 285 ori $K,$K,0xbcdc 286 ___ 287 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } 288 $code.=<<___; # load K_60_79 289 lis $K,0xca62 290 ori $K,$K,0xc1d6 291 ___ 292 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } 293 $code.=<<___; 294 add r16,r16,$E 295 add r17,r17,$T 296 add r18,r18,$A 297 add r19,r19,$B 298 add r20,r20,$C 299 stw r16,0($ctx) 300 mr $A,r16 301 stw r17,4($ctx) 302 mr $B,r17 303 stw r18,8($ctx) 304 mr $C,r18 305 stw r19,12($ctx) 306 mr $D,r19 307 stw r20,16($ctx) 308 mr $E,r20 309 addi $inp,$inp,`16*4` 310 bdnz- Lsha1_block_private 311 blr 312 ___ 313 $code.=<<___; 314 .asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>" 315 ___ 316 317 $code =~ s/\`([^\`]*)\`/eval $1/gem; 318 print $code; 319 close STDOUT; 320