Home | History | Annotate | Download | only in crypto
      1 #!/usr/bin/env perl
      2 
      3 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of the script path, including the trailing / or \
      4 push(@INC, "${dir}perlasm", "perlasm");	# search <script dir>/perlasm and ./perlasm for the required module
      5 require "x86asm.pl";
      6 
      7 &asm_init($ARGV[0],"x86cpuid");		# first command-line argument is handed to asm_init; its meaning is defined in x86asm.pl
      8 
      9 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }	# $sse2 enables the XMM register wipe emitted in OPENSSL_wipe_cpu
     10 
     11 &function_begin("OPENSSL_ia32_cpuid");
        	# Emits OPENSSL_ia32_cpuid. The generated routine probes the CPU with
        	# CPUID and leaves two feature words in eax and edx:
        	#   eax - derived from CPUID leaf 1 EDX, with bits 20, 28 and 30
        	#         rewritten below;
        	#   edx - derived from CPUID leaf 1 ECX, with bit 11 replaced by the
        	#         AMD XOP flag and AVX-related bits cleared when the OS has
        	#         not enabled the matching register state.
        	# When CPUID is not available, eax and edx are both zero on exit.
        	#
        	# Step 1: try to flip EFLAGS bit 21 (the ID flag). If it does not
        	# change, the CPUID instruction is not supported.
     12 	&xor	("edx","edx");
     13 	&pushf	();
     14 	&pop	("eax");
     15 	&mov	("ecx","eax");		# keep the original EFLAGS for comparison
     16 	&xor	("eax",1<<21);
     17 	&push	("eax");
     18 	&popf	();
     19 	&pushf	();
     20 	&pop	("eax");
     21 	&xor	("ecx","eax");		# ecx = bits that differ from the original
     22 	&xor	("eax","eax");
     23 	&bt	("ecx",21);		# did bit 21 actually flip?
     24 	&jnc	(&label("nocpuid"));	# no: leave with eax = edx = 0
     25 	&cpuid	();			# leaf 0 (eax was zeroed above)
     26 	&mov	("edi","eax");		# max value for standard query level
     27 
        	# Compare the vendor string with "GenuineIntel"; ebp collects the
        	# mismatches and is 0 only when all three registers match.
     28 	&xor	("eax","eax");
     29 	&cmp	("ebx",0x756e6547);	# "Genu"
     30 	&setne	(&LB("eax"));
     31 	&mov	("ebp","eax");
     32 	&cmp	("edx",0x49656e69);	# "ineI"
     33 	&setne	(&LB("eax"));
     34 	&or	("ebp","eax");
     35 	&cmp	("ecx",0x6c65746e);	# "ntel"
     36 	&setne	(&LB("eax"));
     37 	&or	("ebp","eax");		# 0 indicates Intel CPU
     38 	&jz	(&label("intel"));
     39 
        	# Same comparison against "AuthenticAMD", collected in esi. Any
        	# vendor that is neither Intel nor AMD also takes the "intel" path.
     40 	&cmp	("ebx",0x68747541);	# "Auth"
     41 	&setne	(&LB("eax"));
     42 	&mov	("esi","eax");
     43 	&cmp	("edx",0x69746E65);	# "enti"
     44 	&setne	(&LB("eax"));
     45 	&or	("esi","eax");
     46 	&cmp	("ecx",0x444D4163);	# "cAMD"
     47 	&setne	(&LB("eax"));
     48 	&or	("esi","eax");		# 0 indicates AMD CPU
     49 	&jnz	(&label("intel"));
     50 
     51 	# AMD specific
     52 	&mov	("eax",0x80000000);	# ask for the highest extended leaf
     53 	&cpuid	();
     54 	&cmp	("eax",0x80000001);
     55 	&jb	(&label("intel"));	# no extended leaves: use the common path
     56 	&mov	("esi","eax");		# remember the highest extended leaf
     57 	&mov	("eax",0x80000001);
     58 	&cpuid	();
     59 	&or	("ebp","ecx");
     60 	&and	("ebp",1<<11|1);	# isolate XOP bit; bit 0 is kept so ebp stays non-zero (non-Intel marker)
     61 	&cmp	("esi",0x80000008);
     62 	&jb	(&label("intel"));	# leaf 0x80000008 not available
     63 
     64 	&mov	("eax",0x80000008);
     65 	&cpuid	();
     66 	&movz	("esi",&LB("ecx"));	# number of cores - 1
     67 	&inc	("esi");		# number of cores
     68 
     69 	&mov	("eax",1);
     70 	&cpuid	();
     71 	&bt	("edx",28);		# hyper-threading bit reported?
     72 	&jnc	(&label("generic"));
     73 	&shr	("ebx",16);
     74 	&and	("ebx",0xff);		# ebx = bits 23:16 of leaf 1 EBX
     75 	&cmp	("ebx","esi");
     76 	&ja	(&label("generic"));	# value exceeds the core count: keep the bit
     77 	&and	("edx",0xefffffff);	# clear hyper-threading bit
     78 	&jmp	(&label("generic"));
     79 	
     80 &set_label("intel");
     81 	&cmp	("edi",4);		# is standard leaf 4 available?
     82 	&mov	("edi",-1);		# default when it is not; mov leaves the flags from cmp untouched
     83 	&jb	(&label("nocacheinfo"));
     84 
     85 	&mov	("eax",4);
     86 	&mov	("ecx",0);		# query L1D
     87 	&cpuid	();
     88 	&mov	("edi","eax");
     89 	&shr	("edi",14);
     90 	&and	("edi",0xfff);		# number of cores -1 per L1D
     91 
     92 &set_label("nocacheinfo");
     93 	&mov	("eax",1);
     94 	&cpuid	();
     95 	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
     96 	&cmp	("ebp",0);		# ebp == 0 means the vendor matched Intel
     97 	&jne	(&label("notintel"));
     98 	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
     99 	&and	(&HB("eax"),15);	# family ID
    100 	&cmp	(&HB("eax"),15);	# P4?
    101 	&jne	(&label("notintel"));
    102 	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
    103 &set_label("notintel");
    104 	&bt	("edx",28);		# test hyper-threading bit
    105 	&jnc	(&label("generic"));
    106 	&and	("edx",0xefffffff);	# tentatively clear it
    107 	&cmp	("edi",0);
    108 	&je	(&label("generic"));	# edi == 0: leave the bit cleared
    109 
    110 	&or	("edx",0x10000000);	# otherwise set it again ...
    111 	&shr	("ebx",16);
    112 	&cmp	(&LB("ebx"),1);
    113 	&ja	(&label("generic"));	# ... and keep it when EBX bits 23:16 exceed 1
    114 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
    115 
        	# Common tail: esi becomes the first result word (from EDX), ebp the
        	# second (from ECX, with bit 11 carrying the AMD XOP flag).
    116 &set_label("generic");
    117 	&and	("ebp",1<<11);		# isolate AMD XOP flag
    118 	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
    119 	&mov	("esi","edx");
    120 	&or	("ebp","ecx");		# merge AMD XOP flag
    121 
    122 	&bt	("ecx",27);		# check OSXSAVE bit
    123 	&jnc	(&label("clear_avx"));
    124 	&xor	("ecx","ecx");		# ecx = 0 selects the register read by xgetbv
    125 	&data_byte(0x0f,0x01,0xd0);	# xgetbv
    126 	&and	("eax",6);		# keep bits 1 and 2 of the result
    127 	&cmp	("eax",6);
    128 	&je	(&label("done"));	# both set: nothing to clear
    129 	&cmp	("eax",2);
    130 	&je	(&label("clear_avx"));	# only bit 1 set: clear the AVX group only
        	# Any other value falls through and clears both groups.
    131 &set_label("clear_xmm");
    132 	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
    133 	&and	("esi",0xfeffffff);	# clear FXSR
    134 &set_label("clear_avx");
    135 	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
    136 &set_label("done");
    137 	&mov	("eax","esi");		# first result word
    138 	&mov	("edx","ebp");		# second result word
    139 &set_label("nocpuid");
    140 &function_end("OPENSSL_ia32_cpuid");
    141 
    142 &external_label("OPENSSL_ia32cap_P");	# declare the capability vector used by the routines below as an external symbol
    143 
    144 &function_begin_B("OPENSSL_rdtsc","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
        	# Emits OPENSSL_rdtsc: returns the time-stamp counter in edx:eax,
        	# or 0 in both registers when bit 4 of the first dword of
        	# OPENSSL_ia32cap_P is clear.
    145 	&xor	("eax","eax");		# default result: edx:eax = 0
    146 	&xor	("edx","edx");
    147 	&picmeup("ecx","OPENSSL_ia32cap_P");	# ecx is used as the address of OPENSSL_ia32cap_P below
    148 	&bt	(&DWP(0,"ecx"),4);	# bit 4 of the first capability word (TSC)
    149 	&jnc	(&label("notsc"));	# no TSC: return the zeroes
    150 	&rdtsc	();
    151 &set_label("notsc");
    152 	&ret	();
    153 &function_end_B("OPENSSL_rdtsc");
    154 
    155 # This works in Ring 0 only [read DJGPP+MS-DOS+privileged DPMI host],
    156 # but it's safe to call it on any [supported] 32-bit platform...
    157 # Just check for [non-]zero return value...
        #
        # The routine reads the time-stamp counter, executes HLT, reads the
        # counter again and returns the difference in edx:eax. It returns 0
        # without halting when there is no TSC, when the low two bits of CS
        # are non-zero, or when interrupts are disabled.
    158 &function_begin_B("OPENSSL_instrument_halt","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
    159 	&picmeup("ecx","OPENSSL_ia32cap_P");
    160 	&bt	(&DWP(0,"ecx"),4);
    161 	&jnc	(&label("nohalt"));	# no TSC
    162 
    163 	&data_word(0x9058900e);		# push %cs; pop %eax (each followed by a nop to fill the dword)
    164 	&and	("eax",3);		# low two bits of the CS selector
    165 	&jnz	(&label("nohalt"));	# not enough privileges
    166 
    167 	&pushf	();
    168 	&pop	("eax");
    169 	&bt	("eax",9);		# EFLAGS bit 9
    170 	&jnc	(&label("nohalt"));	# interrupts are disabled
    171 
    172 	&rdtsc	();
    173 	&push	("edx");		# save the starting counter on the stack, high dword first
    174 	&push	("eax");
    175 	&halt	();
    176 	&rdtsc	();
    177 
    178 	&sub	("eax",&DWP(0,"esp"));	# edx:eax = counter after - counter before
    179 	&sbb	("edx",&DWP(4,"esp"));
    180 	&add	("esp",8);		# drop the saved counter
    181 	&ret	();
    182 
    183 &set_label("nohalt");
    184 	&xor	("eax","eax");		# return 0 in edx:eax
    185 	&xor	("edx","edx");
    186 	&ret	();
    187 &function_end_B("OPENSSL_instrument_halt");
    188 
    189 # Essentially there is only one use for this function. Under DJGPP:
    190 #
    191 #	#include <go32.h>
    192 #	...
    193 #	i=OPENSSL_far_spin(_dos_ds,0x46c);
    194 #	...
    195 # to obtain the number of spins till closest timer interrupt.
        #
        # Arguments (taken from the stack): a segment selector and an offset
        # within that segment. The routine loads the selector into %ds, then
        # counts in eax how many times it re-reads the dword at that offset
        # until the value differs from the first read, restores %ds and
        # returns the count. It returns 0 (eax and edx both zero) without
        # spinning when EFLAGS bit 9 is clear.
    196 
    197 &function_begin_B("OPENSSL_far_spin");
    198 	&pushf	();
    199 	&pop	("eax");		# eax = EFLAGS
    200 	&bt	("eax",9);
    201 	&jnc	(&label("nospin"));	# interrupts are disabled
    202 
    203 	&mov	("eax",&DWP(4,"esp"));	# 1st argument: segment selector
    204 	&mov	("ecx",&DWP(8,"esp"));	# 2nd argument: offset
    205 	&data_word (0x90d88e1e);	# push %ds, mov %eax,%ds
    206 	&xor	("eax","eax");		# spin counter
    207 	&mov	("edx",&DWP(0,"ecx"));	# reference value to watch for a change
    208 	&jmp	(&label("spin"));
    209 
    210 	&align	(16);
    211 &set_label("spin");
    212 	&inc	("eax");
    213 	&cmp	("edx",&DWP(0,"ecx"));
    214 	&je	(&label("spin"));	# unchanged: keep spinning
    215 
    216 	&data_word (0x1f909090);	# pop	%ds
    217 	&ret	();
    218 
    219 &set_label("nospin");
    220 	&xor	("eax","eax");
    221 	&xor	("edx","edx");
    222 	&ret	();
    223 &function_end_B("OPENSSL_far_spin");
    224 
    225 &function_begin_B("OPENSSL_wipe_cpu","EXTRN\t_OPENSSL_ia32cap_P:DWORD");
        	# Emits OPENSSL_wipe_cpu: zeroes eax and edx, clears the XMM
        	# registers (only when built with -DOPENSSL_IA32_SSE2 and the
        	# capability word reports both SSE2 and FXSR) and the x87 register
        	# stack, then returns in eax the address 4 bytes above the stack
        	# pointer at entry. ecx is clobbered.
    226 	&xor	("eax","eax");
    227 	&xor	("edx","edx");
    228 	&picmeup("ecx","OPENSSL_ia32cap_P");
    229 	&mov	("ecx",&DWP(0,"ecx"));	# ecx = first capability word (the value, no longer an address)
    230 	&bt	("ecx",0);		# x87 FPU flag; test the loaded value, ecx must not be dereferenced again
    231 	&jnc	(&label("no_x87"));
    232 	if ($sse2) {
    233 		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
    234 		&cmp	("ecx",1<<26|1<<24);
    235 		&jne	(&label("no_sse2"));
    236 		&pxor	("xmm0","xmm0");
    237 		&pxor	("xmm1","xmm1");
    238 		&pxor	("xmm2","xmm2");
    239 		&pxor	("xmm3","xmm3");
    240 		&pxor	("xmm4","xmm4");
    241 		&pxor	("xmm5","xmm5");
    242 		&pxor	("xmm6","xmm6");
    243 		&pxor	("xmm7","xmm7");
    244 	&set_label("no_sse2");
    245 	}
    246 	# just a bunch of fldz to zap the fp/mm bank followed by finit...
    247 	&data_word(0xeed9eed9,0xeed9eed9,0xeed9eed9,0xeed9eed9,0x90e3db9b);
    248 &set_label("no_x87");
    249 	&lea	("eax",&DWP(4,"esp"));	# return value: address just above the return address
    250 	&ret	();
    251 &function_end_B("OPENSSL_wipe_cpu");
    252 
    253 &function_begin_B("OPENSSL_atomic_add");
        	# Emits OPENSSL_atomic_add: atomically adds the 2nd argument to the
        	# dword addressed by the 1st argument using a lock cmpxchg retry
        	# loop, and returns the new value in eax.
    254 	&mov	("edx",&DWP(4,"esp"));	# fetch the pointer, 1st arg
    255 	&mov	("ecx",&DWP(8,"esp"));	# fetch the increment, 2nd arg
    256 	&push	("ebx");		# ebx is used as scratch and restored before returning
    257 	&nop	();
    258 	&mov	("eax",&DWP(0,"edx"));	# current value, the expected operand for cmpxchg
    259 &set_label("spin");
    260 	&lea	("ebx",&DWP(0,"eax","ecx"));	# ebx = current value + increment
    261 	&nop	();
    262 	&data_word(0x1ab10ff0);	# lock;	cmpxchg	%ebx,(%edx)	# %eax is involved and is always reloaded
    263 	&jne	(&label("spin"));	# memory changed in the meantime: retry with the reloaded eax
    264 	&mov	("eax","ebx");	# OpenSSL expects the new value
    265 	&pop	("ebx");
    266 	&ret	();
    267 &function_end_B("OPENSSL_atomic_add");
    268 
    269 # This function can become handy under Win32 in situations when
    270 # we don't know which calling convention, __stdcall or __cdecl(*),
    271 # indirect callee is using. In C it can be deployed as
    272 #
    273 #ifdef OPENSSL_CPUID_OBJ
    274 #	type OPENSSL_indirect_call(void *f,...);
    275 #	...
    276 #	OPENSSL_indirect_call(func,[up to $max arguments]);
    277 #endif
    278 #
    279 # (*)	it's designed to work even for __fastcall if number of
    280 #	arguments is 1 or 2!
        #
        # The routine copies $max argument slots into a fresh stack area,
        # also leaves the first two arguments in ecx and edx, calls the
        # function pointer passed as the first parameter, and then restores
        # esp from ebp regardless of how much stack the callee popped.
    281 &function_begin_B("OPENSSL_indirect_call");
    282 	{
    283 	my ($max,$i)=(7,);	# $max has to be chosen as 4*n-1
    284 				# in order to preserve eventual
    285 				# stack alignment
    286 	&push	("ebp");
    287 	&mov	("ebp","esp");
    288 	&sub	("esp",$max*4);		# room for $max argument slots
    289 	&mov	("ecx",&DWP(12,"ebp"));	# 1st argument for the callee (8(%ebp) is the function pointer)
    290 	&mov	(&DWP(0,"esp"),"ecx");
    291 	&mov	("edx",&DWP(16,"ebp"));	# 2nd argument for the callee
    292 	&mov	(&DWP(4,"esp"),"edx");
    293 	for($i=2;$i<$max;$i++)		# remaining slots 2 .. $max-1
    294 		{
    295 		# Some copies will be redundant/bogus...
    296 		&mov	("eax",&DWP(12+$i*4,"ebp"));
    297 		&mov	(&DWP(0+$i*4,"esp"),"eax");
    298 		}
    299 	&call_ptr	(&DWP(8,"ebp"));# make the call...
    300 	&mov	("esp","ebp");	# ... and just restore the stack pointer
    301 				# without paying attention to what we called,
    302 				# (__cdecl *func) or (__stdcall *one).
    303 	&pop	("ebp");
    304 	&ret	();
    305 	}
    306 &function_end_B("OPENSSL_indirect_call");
    307 
    308 &function_begin_B("OPENSSL_cleanse");
        	# Emits OPENSSL_cleanse: stores zero over a buffer. Judging by how
        	# they are used, wparam(0) is the buffer pointer and wparam(1) the
        	# byte count. Fewer than 7 bytes are cleared one byte at a time;
        	# longer buffers are first advanced to a 4-byte boundary, cleared
        	# a dword at a time, and finished bytewise.
    309 	&mov	("edx",&wparam(0));
    310 	&mov	("ecx",&wparam(1));
    311 	&xor	("eax","eax");		# the zero that gets stored
    312 	&cmp	("ecx",7);
    313 	&jae	(&label("lot"));
    314 	&cmp	("ecx",0);
    315 	&je	(&label("ret"));	# nothing to do for a zero count
    316 &set_label("little");
    317 	&mov	(&BP(0,"edx"),"al");
    318 	&sub	("ecx",1);		# sets the flags tested by jnz below
    319 	&lea	("edx",&DWP(1,"edx"));	# advance the pointer; lea leaves the flags untouched
    320 	&jnz	(&label("little"));
    321 &set_label("ret");
    322 	&ret	();
    323 
    324 &set_label("lot",16);
    325 	&test	("edx",3);		# pointer 4-byte aligned yet?
    326 	&jz	(&label("aligned"));
    327 	&mov	(&BP(0,"edx"),"al");	# clear one byte and try again
    328 	&lea	("ecx",&DWP(-1,"ecx"));
    329 	&lea	("edx",&DWP(1,"edx"));
    330 	&jmp	(&label("lot"));
    331 &set_label("aligned");
    332 	&mov	(&DWP(0,"edx"),"eax");
    333 	&lea	("ecx",&DWP(-4,"ecx"));
    334 	&test	("ecx",-4);		# any bit above the low two set, i.e. at least 4 bytes left?
    335 	&lea	("edx",&DWP(4,"edx"));	# lea keeps the flags from test for the jnz
    336 	&jnz	(&label("aligned"));
    337 	&cmp	("ecx",0);
    338 	&jne	(&label("little"));	# 1..3 trailing bytes are cleared bytewise
    339 	&ret	();
    340 &function_end_B("OPENSSL_cleanse");
    341 
    342 &function_begin_B("OPENSSL_ia32_rdrand");
        	# Emits OPENSSL_ia32_rdrand: executes rdrand up to 8 times until the
        	# carry flag signals success and returns the value in eax. If eax
        	# is zero at the end, it is replaced with the remaining retry count
        	# in ecx, which is 0 once all eight attempts have been used up.
    343 	&mov	("ecx",8);		# retry budget
    344 &set_label("loop");
    345 	&rdrand	("eax");
    346 	&jc	(&label("break"));	# carry set: rdrand delivered a value
    347 	&loop	(&label("loop"));	# decrement ecx and retry while it is non-zero
    348 &set_label("break");
    349 	&cmp	("eax",0);
    350 	&cmove	("eax","ecx");		# eax == 0: return the remaining count instead
    351 	&ret	();
    352 &function_end_B("OPENSSL_ia32_rdrand");
    353 
    354 &initseg("OPENSSL_cpuid_setup");	# presumably registers OPENSSL_cpuid_setup (defined elsewhere) as an initializer -- confirm in x86asm.pl
    355 
    356 &asm_finish();				# final step of the generator script; see x86asm.pl for what it writes
    357