#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# I let hardware handle unaligned input(*), except on page boundaries
# (see below for details). Otherwise straightforward implementation
# with X vector in register bank. The module is big-endian [which is
# not big deal as there're no little-endian targets left around].
#
# (*) this means that this module is inappropriate for PPC403? Does
#     anybody know if pre-POWER3 can sustain unaligned load?

# 			-m64	-m32
# ----------------------------------
# PPC970,gcc-4.0.0	+76%	+59%
# Power6,xlc-7		+68%	+33%

# Usage: <this script> <flavour> [<output>]
#
# <flavour> must contain "64" or "32"; it selects pointer-sized
# load/store/compare mnemonics and is handed on to ppc-xlate.pl
# together with the optional <output> argument. The generated code
# is written to STDOUT, which is re-opened as a pipe into ppc-xlate.pl.

$flavour = shift;

# Pointer-size dependent mnemonics and the offset of the LR save slot
# relative to the caller's stack pointer.
if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed below through the translator. Low-precedence
# "or" is required here: with "||" the die would attach to the (always
# true) command string and a failed open would go unnoticed.
my $output = shift;
open STDOUT,"| $^X $xlate $flavour ".(defined $output ? $output : "")
	or die "can't call $xlate: $!";

# Stack frame size, and the offset of the 64-byte bounce buffer that
# Lmemcpy fills (16 iterations x 4 bytes) for page-crossing blocks.
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;

# Register assignment.
$K  ="r0";	# current round constant
$sp ="r1";
$toc="r2";
$ctx="r3";	# 1st argument: hash state, five 32-bit words
$inp="r4";	# 2nd argument: input pointer
$num="r5";	# 3rd argument: number of 64-byte blocks
$t0 ="r15";	# scratch
$t1 ="r6";	# scratch

$A  ="r7";
$B  ="r8";
$C  ="r9";
$D  ="r10";
$E  ="r11";
$T  ="r12";

# Working variables; the list is rotated by one after every round so
# that the register written as $f in one round is $a in the next.
@V=($A,$B,$C,$D,$E,$T);
# 16-word message schedule window, kept entirely in registers.
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
    "r24","r25","r26","r27","r28","r29","r30","r31");

# BODY_00_19($i,$a,$b,$c,$d,$e,$f) appends code for round $i (0..19):
#   $f = $K + $e + X[$i] + rol($a,5) + (($b & $c) | (~$b & $d))
#   $b = rol($b,30)
# $e is clobbered (it holds rol($a,5) as a temporary).
# For $i<15 the next input word is loaded straight from $inp (round 0
# additionally loads X[0]); for $i>=15 the next schedule word is
# computed in place as
#   X[j] = rol(X[j] ^ X[j+2] ^ X[j+8] ^ X[j+13], 1), indices mod 16.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	lwz	@X[$i],`$i*4`($inp)
___
$code.=<<___ if ($i<15);
	lwz	@X[$j],`$j*4`($inp)
	add	$f,$K,$e
	rotlwi	$e,$a,5
	add	$f,$f,@X[$i]
	and	$t0,$c,$b
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	add	$f,$f,$t0
___
$code.=<<___ if ($i>=15);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$c,$b
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	andc	$t1,$d,$b
	rotlwi	$b,$b,30
	or	$t0,$t0,$t1
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e,$f) appends code for a round using
#   $f = $K + $e + X[$i%16] + rol($a,5) + ($b ^ $c ^ $d)
#   $b = rol($b,30)
# It is called for rounds 20..39 and again for 60..79. For $i<79 the
# next schedule word is updated alongside; for $i==79 there is no next
# word, so the slots are used instead to load the five state words
# from $ctx into r16..r20 for the additions that follow the last round.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	rotlwi	$b,$b,30
	xor	$t0,$t0,$d
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	add	$f,$f,$t0
	rotlwi	@X[$j%16],@X[$j%16],1
___
$code.=<<___ if ($i==79);
	add	$f,$K,$e
	rotlwi	$e,$a,5
	lwz	r16,0($ctx)
	add	$f,$f,@X[$i%16]
	xor	$t0,$b,$c
	lwz	r17,4($ctx)
	add	$f,$f,$e
	rotlwi	$b,$b,30
	lwz	r18,8($ctx)
	xor	$t0,$t0,$d
	lwz	r19,12($ctx)
	add	$f,$f,$t0
	lwz	r20,16($ctx)
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e,$f) appends code for round $i (40..59):
#   $f = $K + $e + X[$i%16] + rol($a,5) + (($b & $c) | (($b | $c) & $d))
#   $b = rol($b,30)
# and updates the next schedule word as in the other round bodies.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	add	$f,$K,$e
	rotlwi	$e,$a,5
	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
	add	$f,$f,@X[$i%16]
	and	$t0,$b,$c
	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
	add	$f,$f,$e
	or	$t1,$b,$c
	rotlwi	$b,$b,30
	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
	and	$t1,$t1,$d
	or	$t0,$t0,$t1
	rotlwi	@X[$j%16],@X[$j%16],1
	add	$f,$f,$t0
___
}

# Public entry point. It allocates the frame, saves r15-r31 and LR,
# loads the five state words from $ctx into $A..$E and then hands
# blocks to Lsha1_block_private with the block count in CTR. Input
# that is not 4-byte aligned takes the Lunaligned path (explained in
# the assembly comment below); the slot at $FRAME-$SIZE_T*18 keeps
# $inp while the private function runs on the bounce buffer.
$code=<<___;
.machine	"any"
.text

.globl	.sha1_block_data_order
.align	4
.sha1_block_data_order:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
	lwz	$A,0($ctx)
	lwz	$B,4($ctx)
	lwz	$C,8($ctx)
	lwz	$D,12($ctx)
	lwz	$E,16($ctx)
	andi.	r0,$inp,3
	bne	Lunaligned
Laligned:
	mtctr	$num
	bl	Lsha1_block_private
	b	Ldone

; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align	4
Lunaligned:
	subfic	$t1,$inp,4096
	andi.	$t1,$t1,4095	; distance to closest page boundary
	srwi.	$t1,$t1,6	; t1/=64
	beq	Lcross_page
	$UCMP	$num,$t1
	ble-	Laligned	; didn't cross the page boundary
	mtctr	$t1
	subfc	$num,$t1,$num
	bl	Lsha1_block_private
Lcross_page:
	li	$t1,16
	mtctr	$t1
	addi	r20,$sp,$LOCALS	; spot within the frame
Lmemcpy:
	lbz	r16,0($inp)
	lbz	r17,1($inp)
	lbz	r18,2($inp)
	lbz	r19,3($inp)
	addi	$inp,$inp,4
	stb	r16,0(r20)
	stb	r17,1(r20)
	stb	r18,2(r20)
	stb	r19,3(r20)
	addi	r20,r20,4
	bdnz	Lmemcpy

	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
	li	$t1,1
	addi	$inp,$sp,$LOCALS
	mtctr	$t1
	bl	Lsha1_block_private
	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
	addic.	$num,$num,-1
	bne-	Lunaligned

Ldone:
	$POP	r0,`$FRAME+$LRSAVE`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	mtlr	r0
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,3,0
	.long	0
___

# This is private block function, which uses tailored calling
# interface, namely upon entry SHA_CTX is pre-loaded to given
# registers and counter register contains amount of chunks to
# digest...
$code.=<<___;
.align	4
Lsha1_block_private:
___
# Emit the 80 rounds in four groups of 20, loading each group's round
# constant into $K first. $i deliberately carries over from one loop
# to the next; @V is rotated after every round.
$code.=<<___;		# load K_00_19
	lis	$K,0x5a82
	ori	$K,$K,0x7999
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_20_39
	lis	$K,0x6ed9
	ori	$K,$K,0xeba1
___
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_40_59
	lis	$K,0x8f1b
	ori	$K,$K,0xbcdc
___
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;		# load K_60_79
	lis	$K,0xca62
	ori	$K,$K,0xc1d6
___
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# r16..r20 were loaded with the previous state by round 79. Add the
# working variables (found in $E,$T,$A,$B,$C after 80 rotations of
# @V), store the new state to $ctx, copy it back into $A..$E for the
# next block, advance $inp by 64 bytes and loop while CTR is non-zero.
$code.=<<___;
	add	r16,r16,$E
	add	r17,r17,$T
	add	r18,r18,$A
	add	r19,r19,$B
	add	r20,r20,$C
	stw	r16,0($ctx)
	mr	$A,r16
	stw	r17,4($ctx)
	mr	$B,r17
	stw	r18,8($ctx)
	mr	$C,r18
	stw	r19,12($ctx)
	mr	$D,r19
	stw	r20,16($ctx)
	mr	$E,r20
	addi	$inp,$inp,`16*4`
	bdnz-	Lsha1_block_private
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
___

# Evaluate every `...` arithmetic expression left in the template.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe: a translator failure or a buffered write error is
# only reported by close, so it has to be checked.
close STDOUT
	or die "error closing pipe to $xlate: ".($! ? $! : "exit status $?");