Home | History | Annotate | Download | only in fipsmodule
      ; Pick the code-section declaration that matches the assembler's
      ; __OUTPUT_FORMAT__ (obj, win32, or anything else).
      1 %ifidn __OUTPUT_FORMAT__,obj
      ; "obj" output: section named "code", use32, class "code", align 64.
      2 section	code	use32 class=code align=64
      3 %elifidn __OUTPUT_FORMAT__,win32
      4 %ifdef __YASM_VERSION_ID__
      ; Assembling with Yasm: refuse versions below 1.1.0 (01010000h).
      5 %if __YASM_VERSION_ID__ < 01010000h
      6 %error yasm version 1.1.0 or later needed.
      7 %endif
      8 ; Yasm automatically includes @feat.00 and complains about redefining it.
      9 ; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html
     10 %else
      ; Not Yasm: define the @feat.00 symbol ourselves (see the safeseh link
      ; above for its meaning).
     11 $@feat.00 equ 1
     12 %endif
     13 section	.text	code align=64
     14 %else
      ; Any other output format: plain .text code section, default alignment.
     15 section	.text	code
     16 %endif
     ; ---------------------------------------------------------------------
     ; _gcm_gmult_4bit_mmx(arg1, arg2)
     ;
     ; Stack arguments (read after the four register pushes below):
     ;   arg1 -> edi : 16-byte buffer, read and overwritten in place
     ;   arg2 -> esi : table of 16-byte entries, indexed by nibble*16
     ;   (presumably the GHASH state Xi and the precomputed Htable -- the
     ;    C prototype is not visible here; confirm against the caller)
     ;
     ; Works through the 16 bytes of arg1 from byte 15 down to byte 0, one
     ; nibble at a time. mm1:mm0 is a 128-bit accumulator (mm1 = upper half):
     ; each step shifts it right by 4 bits, XORs in a table entry, and folds
     ; the 4 bits shifted out back in through the L$rem_4bit table.
     ;
     ; Saves and restores ebp, ebx, esi, edi; overwrites eax, ecx, edx and
     ; mm0-mm2; executes emms before returning.
     ; ---------------------------------------------------------------------
     17 global	_gcm_gmult_4bit_mmx
     18 align	16
     19 _gcm_gmult_4bit_mmx:
     20 L$_gcm_gmult_4bit_mmx_begin:
     21 	push	ebp
     22 	push	ebx
     23 	push	esi
     24 	push	edi
     ; 4 pushes + return address = 20 bytes, so arg1 is at [20+esp].
     25 	mov	edi,DWORD [20+esp]
     26 	mov	esi,DWORD [24+esp]
     ; Position-independent address of L$rem_4bit: call/pop yields the
     ; address of L$000pic_point, then the label difference is added.
     27 	call	L$000pic_point
     28 L$000pic_point:
     29 	pop	eax
     30 	lea	eax,[(L$rem_4bit-L$000pic_point)+eax]
     ; Start with the last byte: ecx = low nibble * 16, edx = high nibble * 16
     ; (both are byte offsets into the 16-byte-per-entry table at esi).
     31 	movzx	ebx,BYTE [15+edi]
     32 	xor	ecx,ecx
     33 	mov	edx,ebx
     34 	mov	cl,dl
     ; ebp = index of the next byte to fetch (14 down to 0).
     35 	mov	ebp,14
     36 	shl	cl,4
     37 	and	edx,240
     ; Seed the accumulator with the entry for the low nibble.
     38 	movq	mm0,[8+ecx*1+esi]
     39 	movq	mm1,[ecx*1+esi]
     40 	movd	ebx,mm0
     41 	jmp	NEAR L$001mmx_loop
     42 align	16
     43 L$001mmx_loop:
     ; First half of the iteration: shift mm1:mm0 right by 4 (mm2 carries the
     ; low 4 bits of mm1 into the top of mm0), XOR in the entry for the high
     ; nibble (edx), and XOR L$rem_4bit[ebx & 15] into mm1, where ebx & 15 is
     ; the 4 bits that were shifted out of mm0.
     44 	psrlq	mm0,4
     45 	and	ebx,15
     46 	movq	mm2,mm1
     47 	psrlq	mm1,4
     48 	pxor	mm0,[8+edx*1+esi]
     ; Fetch the next byte while the MMX work is in flight.
     49 	mov	cl,BYTE [ebp*1+edi]
     50 	psllq	mm2,60
     51 	pxor	mm1,[ebx*8+eax]
     52 	dec	ebp
     53 	movd	ebx,mm0
     54 	pxor	mm1,[edx*1+esi]
     55 	mov	edx,ecx
     56 	pxor	mm0,mm2
     ; ebp went below zero: the byte just loaded was byte 0, finish outside.
     57 	js	NEAR L$002mmx_break
     ; Second half: same shift/fold step using the new byte's low nibble (ecx);
     ; edx keeps its high nibble for the top of the next iteration.
     58 	shl	cl,4
     59 	and	ebx,15
     60 	psrlq	mm0,4
     61 	and	edx,240
     62 	movq	mm2,mm1
     63 	psrlq	mm1,4
     64 	pxor	mm0,[8+ecx*1+esi]
     65 	psllq	mm2,60
     66 	pxor	mm1,[ebx*8+eax]
     67 	movd	ebx,mm0
     68 	pxor	mm1,[ecx*1+esi]
     69 	pxor	mm0,mm2
     70 	jmp	NEAR L$001mmx_loop
     71 align	16
     72 L$002mmx_break:
     ; Last byte: one step for its low nibble (ecx) ...
     73 	shl	cl,4
     74 	and	ebx,15
     75 	psrlq	mm0,4
     76 	and	edx,240
     77 	movq	mm2,mm1
     78 	psrlq	mm1,4
     79 	pxor	mm0,[8+ecx*1+esi]
     80 	psllq	mm2,60
     81 	pxor	mm1,[ebx*8+eax]
     82 	movd	ebx,mm0
     83 	pxor	mm1,[ecx*1+esi]
     84 	pxor	mm0,mm2
     ; ... and one step for its high nibble (edx).
     85 	psrlq	mm0,4
     86 	and	ebx,15
     87 	movq	mm2,mm1
     88 	psrlq	mm1,4
     89 	pxor	mm0,[8+edx*1+esi]
     90 	psllq	mm2,60
     91 	pxor	mm1,[ebx*8+eax]
     92 	movd	ebx,mm0
     93 	pxor	mm1,[edx*1+esi]
     94 	pxor	mm0,mm2
     ; Split the accumulator into four dwords:
     ;   ebx = mm0 low, ecx = mm0 high, edx = mm1 low, ebp = mm1 high.
     95 	psrlq	mm0,32
     96 	movd	edx,mm1
     97 	psrlq	mm1,32
     98 	movd	ecx,mm0
     99 	movd	ebp,mm1
    100 	bswap	ebx
    101 	bswap	edx
    102 	bswap	ecx
    103 	bswap	ebp
    104 	emms
     ; Store byte-swapped: mm1 high dword lands at offset 0, mm0 low dword at
     ; offset 12.
    105 	mov	DWORD [12+edi],ebx
    106 	mov	DWORD [4+edi],edx
    107 	mov	DWORD [8+edi],ecx
    108 	mov	DWORD [edi],ebp
    109 	pop	edi
    110 	pop	esi
    111 	pop	ebx
    112 	pop	ebp
    113 	ret
    ; ---------------------------------------------------------------------
    ; _gcm_ghash_4bit_mmx(arg1, arg2, arg3, arg4)
    ;
    ; Stack arguments (read after the four register pushes below):
    ;   arg1 -> eax : 16-byte state, read at entry and written at exit
    ;   arg2 -> ebx : table of 16 entries of 16 bytes each
    ;   arg3 -> ecx : input pointer, consumed 16 bytes per outer iteration
    ;   arg4 -> edx : byte count; arg3+arg4 is the end pointer
    ;   (presumably Xi, Htable, inp, len -- the C prototype is not visible
    ;    here; confirm against the caller)
    ;
    ; NOTE(review): the outer loop runs at least once and exits only when the
    ; input pointer equals arg3+arg4 exactly, so arg4 is presumably required
    ; to be a non-zero multiple of 16 -- confirm with callers.
    ;
    ; Stack frame after alignment (offsets from esp):
    ;   [0   .. 16 )  16 bytes: low byte of (dword at entry offset 8) << 4
    ;   [16  .. 144)  16 qwords: qword at offset 8 of each table entry
    ;   [144 .. 272)  16 qwords: qword at offset 0 of each table entry
    ;   [272 .. 400)  16 qwords: lower half of each entry shifted right by 4
    ;   [400 .. 528)  16 qwords: upper half of each entry shifted right by 4
    ;   [528 .. 536)  spilled mm6,  [536] spilled ebx
    ;   [544] arg1,   [548] current input pointer,
    ;   [552] end pointer,  [556] caller's esp
    ;
    ; Saves and restores ebp, ebx, esi, edi; overwrites eax, ecx, edx and
    ; mm0-mm7; executes emms before returning.
    ; ---------------------------------------------------------------------
    114 global	_gcm_ghash_4bit_mmx
    115 align	16
    116 _gcm_ghash_4bit_mmx:
    117 L$_gcm_ghash_4bit_mmx_begin:
    118 	push	ebp
    119 	push	ebx
    120 	push	esi
    121 	push	edi
    122 	mov	eax,DWORD [20+esp]
    123 	mov	ebx,DWORD [24+esp]
    124 	mov	ecx,DWORD [28+esp]
    125 	mov	edx,DWORD [32+esp]
    ; Remember the unaligned esp so the epilogue can restore it.
    126 	mov	ebp,esp
    ; Position-independent address of L$rem_8bit in esi.
    127 	call	L$003pic_point
    128 L$003pic_point:
    129 	pop	esi
    130 	lea	esi,[(L$rem_8bit-L$003pic_point)+esi]
    ; Carve out the frame: 544 bytes, rounded down to a 64-byte boundary, plus
    ; 16 more bytes for the byte table at [esp].
    131 	sub	esp,544
    132 	and	esp,-64
    133 	sub	esp,16
    ; edx = end-of-input pointer.
    134 	add	edx,ecx
    135 	mov	DWORD [544+esp],eax
    136 	mov	DWORD [552+esp],edx
    137 	mov	DWORD [556+esp],ebp
    ; Bias the pointers by 128 so every access below uses a signed 8-bit
    ; displacement: ebx = arg2+128, edi = esp+144, ebp = esp+400.
    138 	add	ebx,128
    139 	lea	edi,[144+esp]
    140 	lea	ebp,[400+esp]
    ; ---- Table expansion, unrolled and software-pipelined over 16 entries ----
    ; For entry i (16 bytes at arg2 + 16*i):
    ;   [esp+i]        = low byte of (dword at offset 8) << 4
    ;   [edi-128+8*i]  = qword at offset 8        [edi+8*i] = qword at offset 0
    ;   [ebp-128+8*i]  = (q8 >> 4) | (q0 << 60)   [ebp+8*i] = q0 >> 4
    141 	mov	edx,DWORD [ebx-120]
    142 	movq	mm0,[ebx-120]
    143 	movq	mm3,[ebx-128]
    144 	shl	edx,4
    145 	mov	BYTE [esp],dl
    146 	mov	edx,DWORD [ebx-104]
    147 	movq	mm2,[ebx-104]
    148 	movq	mm5,[ebx-112]
    149 	movq	[edi-128],mm0
    150 	psrlq	mm0,4
    151 	movq	[edi],mm3
    152 	movq	mm7,mm3
    153 	psrlq	mm3,4
    154 	shl	edx,4
    155 	mov	BYTE [1+esp],dl
    156 	mov	edx,DWORD [ebx-88]
    157 	movq	mm1,[ebx-88]
    158 	psllq	mm7,60
    159 	movq	mm4,[ebx-96]
    160 	por	mm0,mm7
    161 	movq	[edi-120],mm2
    162 	psrlq	mm2,4
    163 	movq	[8+edi],mm5
    164 	movq	mm6,mm5
    165 	movq	[ebp-128],mm0
    166 	psrlq	mm5,4
    167 	movq	[ebp],mm3
    168 	shl	edx,4
    169 	mov	BYTE [2+esp],dl
    170 	mov	edx,DWORD [ebx-72]
    171 	movq	mm0,[ebx-72]
    172 	psllq	mm6,60
    173 	movq	mm3,[ebx-80]
    174 	por	mm2,mm6
    175 	movq	[edi-112],mm1
    176 	psrlq	mm1,4
    177 	movq	[16+edi],mm4
    178 	movq	mm7,mm4
    179 	movq	[ebp-120],mm2
    180 	psrlq	mm4,4
    181 	movq	[8+ebp],mm5
    182 	shl	edx,4
    183 	mov	BYTE [3+esp],dl
    184 	mov	edx,DWORD [ebx-56]
    185 	movq	mm2,[ebx-56]
    186 	psllq	mm7,60
    187 	movq	mm5,[ebx-64]
    188 	por	mm1,mm7
    189 	movq	[edi-104],mm0
    190 	psrlq	mm0,4
    191 	movq	[24+edi],mm3
    192 	movq	mm6,mm3
    193 	movq	[ebp-112],mm1
    194 	psrlq	mm3,4
    195 	movq	[16+ebp],mm4
    196 	shl	edx,4
    197 	mov	BYTE [4+esp],dl
    198 	mov	edx,DWORD [ebx-40]
    199 	movq	mm1,[ebx-40]
    200 	psllq	mm6,60
    201 	movq	mm4,[ebx-48]
    202 	por	mm0,mm6
    203 	movq	[edi-96],mm2
    204 	psrlq	mm2,4
    205 	movq	[32+edi],mm5
    206 	movq	mm7,mm5
    207 	movq	[ebp-104],mm0
    208 	psrlq	mm5,4
    209 	movq	[24+ebp],mm3
    210 	shl	edx,4
    211 	mov	BYTE [5+esp],dl
    212 	mov	edx,DWORD [ebx-24]
    213 	movq	mm0,[ebx-24]
    214 	psllq	mm7,60
    215 	movq	mm3,[ebx-32]
    216 	por	mm2,mm7
    217 	movq	[edi-88],mm1
    218 	psrlq	mm1,4
    219 	movq	[40+edi],mm4
    220 	movq	mm6,mm4
    221 	movq	[ebp-96],mm2
    222 	psrlq	mm4,4
    223 	movq	[32+ebp],mm5
    224 	shl	edx,4
    225 	mov	BYTE [6+esp],dl
    226 	mov	edx,DWORD [ebx-8]
    227 	movq	mm2,[ebx-8]
    228 	psllq	mm6,60
    229 	movq	mm5,[ebx-16]
    230 	por	mm1,mm6
    231 	movq	[edi-80],mm0
    232 	psrlq	mm0,4
    233 	movq	[48+edi],mm3
    234 	movq	mm7,mm3
    235 	movq	[ebp-88],mm1
    236 	psrlq	mm3,4
    237 	movq	[40+ebp],mm4
    238 	shl	edx,4
    239 	mov	BYTE [7+esp],dl
    240 	mov	edx,DWORD [8+ebx]
    241 	movq	mm1,[8+ebx]
    242 	psllq	mm7,60
    243 	movq	mm4,[ebx]
    244 	por	mm0,mm7
    245 	movq	[edi-72],mm2
    246 	psrlq	mm2,4
    247 	movq	[56+edi],mm5
    248 	movq	mm6,mm5
    249 	movq	[ebp-80],mm0
    250 	psrlq	mm5,4
    251 	movq	[48+ebp],mm3
    252 	shl	edx,4
    253 	mov	BYTE [8+esp],dl
    254 	mov	edx,DWORD [24+ebx]
    255 	movq	mm0,[24+ebx]
    256 	psllq	mm6,60
    257 	movq	mm3,[16+ebx]
    258 	por	mm2,mm6
    259 	movq	[edi-64],mm1
    260 	psrlq	mm1,4
    261 	movq	[64+edi],mm4
    262 	movq	mm7,mm4
    263 	movq	[ebp-72],mm2
    264 	psrlq	mm4,4
    265 	movq	[56+ebp],mm5
    266 	shl	edx,4
    267 	mov	BYTE [9+esp],dl
    268 	mov	edx,DWORD [40+ebx]
    269 	movq	mm2,[40+ebx]
    270 	psllq	mm7,60
    271 	movq	mm5,[32+ebx]
    272 	por	mm1,mm7
    273 	movq	[edi-56],mm0
    274 	psrlq	mm0,4
    275 	movq	[72+edi],mm3
    276 	movq	mm6,mm3
    277 	movq	[ebp-64],mm1
    278 	psrlq	mm3,4
    279 	movq	[64+ebp],mm4
    280 	shl	edx,4
    281 	mov	BYTE [10+esp],dl
    282 	mov	edx,DWORD [56+ebx]
    283 	movq	mm1,[56+ebx]
    284 	psllq	mm6,60
    285 	movq	mm4,[48+ebx]
    286 	por	mm0,mm6
    287 	movq	[edi-48],mm2
    288 	psrlq	mm2,4
    289 	movq	[80+edi],mm5
    290 	movq	mm7,mm5
    291 	movq	[ebp-56],mm0
    292 	psrlq	mm5,4
    293 	movq	[72+ebp],mm3
    294 	shl	edx,4
    295 	mov	BYTE [11+esp],dl
    296 	mov	edx,DWORD [72+ebx]
    297 	movq	mm0,[72+ebx]
    298 	psllq	mm7,60
    299 	movq	mm3,[64+ebx]
    300 	por	mm2,mm7
    301 	movq	[edi-40],mm1
    302 	psrlq	mm1,4
    303 	movq	[88+edi],mm4
    304 	movq	mm6,mm4
    305 	movq	[ebp-48],mm2
    306 	psrlq	mm4,4
    307 	movq	[80+ebp],mm5
    308 	shl	edx,4
    309 	mov	BYTE [12+esp],dl
    310 	mov	edx,DWORD [88+ebx]
    311 	movq	mm2,[88+ebx]
    312 	psllq	mm6,60
    313 	movq	mm5,[80+ebx]
    314 	por	mm1,mm6
    315 	movq	[edi-32],mm0
    316 	psrlq	mm0,4
    317 	movq	[96+edi],mm3
    318 	movq	mm7,mm3
    319 	movq	[ebp-40],mm1
    320 	psrlq	mm3,4
    321 	movq	[88+ebp],mm4
    322 	shl	edx,4
    323 	mov	BYTE [13+esp],dl
    324 	mov	edx,DWORD [104+ebx]
    325 	movq	mm1,[104+ebx]
    326 	psllq	mm7,60
    327 	movq	mm4,[96+ebx]
    328 	por	mm0,mm7
    329 	movq	[edi-24],mm2
    330 	psrlq	mm2,4
    331 	movq	[104+edi],mm5
    332 	movq	mm6,mm5
    333 	movq	[ebp-32],mm0
    334 	psrlq	mm5,4
    335 	movq	[96+ebp],mm3
    336 	shl	edx,4
    337 	mov	BYTE [14+esp],dl
    338 	mov	edx,DWORD [120+ebx]
    339 	movq	mm0,[120+ebx]
    340 	psllq	mm6,60
    341 	movq	mm3,[112+ebx]
    342 	por	mm2,mm6
    343 	movq	[edi-16],mm1
    344 	psrlq	mm1,4
    345 	movq	[112+edi],mm4
    346 	movq	mm7,mm4
    347 	movq	[ebp-24],mm2
    348 	psrlq	mm4,4
    349 	movq	[104+ebp],mm5
    350 	shl	edx,4
    351 	mov	BYTE [15+esp],dl
    ; Pipeline drain: finish the stores for entries 14 and 15.
    352 	psllq	mm7,60
    353 	por	mm1,mm7
    354 	movq	[edi-8],mm0
    355 	psrlq	mm0,4
    356 	movq	[120+edi],mm3
    357 	movq	mm6,mm3
    358 	movq	[ebp-16],mm1
    359 	psrlq	mm3,4
    360 	movq	[112+ebp],mm4
    361 	psllq	mm6,60
    362 	por	mm0,mm6
    363 	movq	[ebp-8],mm0
    364 	movq	[120+ebp],mm3
    ; Load the current state from arg1: bytes 0..7 -> mm6, 8..11 -> ebx,
    ; 12..15 -> edx.
    365 	movq	mm6,[eax]
    366 	mov	ebx,DWORD [8+eax]
    367 	mov	edx,DWORD [12+eax]
    368 align	16
    369 L$004outer:
    ; ---- One 16-byte input block per iteration ----
    ; XOR the block at [ecx] into the state, advance ecx by 16, and spill
    ; ebx / mm6 / ecx to the frame because those registers are reused below.
    370 	xor	edx,DWORD [12+ecx]
    371 	xor	ebx,DWORD [8+ecx]
    372 	pxor	mm6,[ecx]
    373 	lea	ecx,[16+ecx]
    374 	mov	DWORD [536+esp],ebx
    375 	movq	[528+esp],mm6
    376 	mov	DWORD [548+esp],ecx
    ; Bytes are taken from the top of edx downward (rol edx,8 / mov al,dl),
    ; i.e. starting with byte 15 of the XORed block. Each byte is split into a
    ; low nibble (al & 15, indexing the tables at esp+16 / esp+144) and a high
    ; nibble (ebp or edi after shr 4, indexing the >>4 tables at esp+272 /
    ; esp+400 and the byte table at esp+0).
    377 	xor	eax,eax
    378 	rol	edx,8
    379 	mov	al,dl
    380 	mov	ebp,eax
    381 	and	al,15
    382 	shr	ebp,4
    ; mm0/mm1/mm2 take turns holding the L$rem_8bit word (inserted with pinsrw
    ; at word position 2) that is XORed into mm6 on a later step.
    383 	pxor	mm0,mm0
    384 	rol	edx,8
    385 	pxor	mm1,mm1
    386 	pxor	mm2,mm2
    ; mm6:mm7 is the 128-bit accumulator (mm6 = upper half); it starts as the
    ; table entry for the first low nibble.
    387 	movq	mm7,[16+eax*8+esp]
    388 	movq	mm6,[144+eax*8+esp]
    389 	mov	al,dl
    390 	movd	ebx,mm7
    ; Repeating step: shift mm6:mm7 right by 8 bits (mm3 carries the low byte
    ; of mm6 into the top of mm7), XOR in the >>4 entry for the previous high
    ; nibble and the plain entry for the new low nibble, and derive the
    ; L$rem_8bit index from the byte shifted out of mm7 (bl / cl).
    391 	psrlq	mm7,8
    392 	movq	mm3,mm6
    393 	mov	edi,eax
    394 	psrlq	mm6,8
    395 	pxor	mm7,[272+ebp*8+esp]
    396 	and	al,15
    397 	psllq	mm3,56
    398 	shr	edi,4
    399 	pxor	mm7,[16+eax*8+esp]
    400 	rol	edx,8
    401 	pxor	mm6,[144+eax*8+esp]
    402 	pxor	mm7,mm3
    403 	pxor	mm6,[400+ebp*8+esp]
    404 	xor	bl,BYTE [ebp*1+esp]
    405 	mov	al,dl
    406 	movd	ecx,mm7
    407 	movzx	ebx,bl
    408 	psrlq	mm7,8
    409 	movq	mm3,mm6
    410 	mov	ebp,eax
    411 	psrlq	mm6,8
    412 	pxor	mm7,[272+edi*8+esp]
    413 	and	al,15
    414 	psllq	mm3,56
    415 	shr	ebp,4
    416 	pinsrw	mm2,WORD [ebx*2+esi],2
    417 	pxor	mm7,[16+eax*8+esp]
    418 	rol	edx,8
    419 	pxor	mm6,[144+eax*8+esp]
    420 	pxor	mm7,mm3
    421 	pxor	mm6,[400+edi*8+esp]
    422 	xor	cl,BYTE [edi*1+esp]
    423 	mov	al,dl
    ; All four bytes of edx consumed: reload it from the spilled ebx
    ; (state bytes 8..11).
    424 	mov	edx,DWORD [536+esp]
    425 	movd	ebx,mm7
    426 	movzx	ecx,cl
    427 	psrlq	mm7,8
    428 	movq	mm3,mm6
    429 	mov	edi,eax
    430 	psrlq	mm6,8
    431 	pxor	mm7,[272+ebp*8+esp]
    432 	and	al,15
    433 	psllq	mm3,56
    434 	pxor	mm6,mm2
    435 	shr	edi,4
    436 	pinsrw	mm1,WORD [ecx*2+esi],2
    437 	pxor	mm7,[16+eax*8+esp]
    438 	rol	edx,8
    439 	pxor	mm6,[144+eax*8+esp]
    440 	pxor	mm7,mm3
    441 	pxor	mm6,[400+ebp*8+esp]
    442 	xor	bl,BYTE [ebp*1+esp]
    443 	mov	al,dl
    444 	movd	ecx,mm7
    445 	movzx	ebx,bl
    446 	psrlq	mm7,8
    447 	movq	mm3,mm6
    448 	mov	ebp,eax
    449 	psrlq	mm6,8
    450 	pxor	mm7,[272+edi*8+esp]
    451 	and	al,15
    452 	psllq	mm3,56
    453 	pxor	mm6,mm1
    454 	shr	ebp,4
    455 	pinsrw	mm0,WORD [ebx*2+esi],2
    456 	pxor	mm7,[16+eax*8+esp]
    457 	rol	edx,8
    458 	pxor	mm6,[144+eax*8+esp]
    459 	pxor	mm7,mm3
    460 	pxor	mm6,[400+edi*8+esp]
    461 	xor	cl,BYTE [edi*1+esp]
    462 	mov	al,dl
    463 	movd	ebx,mm7
    464 	movzx	ecx,cl
    465 	psrlq	mm7,8
    466 	movq	mm3,mm6
    467 	mov	edi,eax
    468 	psrlq	mm6,8
    469 	pxor	mm7,[272+ebp*8+esp]
    470 	and	al,15
    471 	psllq	mm3,56
    472 	pxor	mm6,mm0
    473 	shr	edi,4
    474 	pinsrw	mm2,WORD [ecx*2+esi],2
    475 	pxor	mm7,[16+eax*8+esp]
    476 	rol	edx,8
    477 	pxor	mm6,[144+eax*8+esp]
    478 	pxor	mm7,mm3
    479 	pxor	mm6,[400+ebp*8+esp]
    480 	xor	bl,BYTE [ebp*1+esp]
    481 	mov	al,dl
    482 	movd	ecx,mm7
    483 	movzx	ebx,bl
    484 	psrlq	mm7,8
    485 	movq	mm3,mm6
    486 	mov	ebp,eax
    487 	psrlq	mm6,8
    488 	pxor	mm7,[272+edi*8+esp]
    489 	and	al,15
    490 	psllq	mm3,56
    491 	pxor	mm6,mm2
    492 	shr	ebp,4
    493 	pinsrw	mm1,WORD [ebx*2+esi],2
    494 	pxor	mm7,[16+eax*8+esp]
    495 	rol	edx,8
    496 	pxor	mm6,[144+eax*8+esp]
    497 	pxor	mm7,mm3
    498 	pxor	mm6,[400+edi*8+esp]
    499 	xor	cl,BYTE [edi*1+esp]
    500 	mov	al,dl
    ; Next dword: upper half of the spilled mm6 (state bytes 4..7).
    501 	mov	edx,DWORD [532+esp]
    502 	movd	ebx,mm7
    503 	movzx	ecx,cl
    504 	psrlq	mm7,8
    505 	movq	mm3,mm6
    506 	mov	edi,eax
    507 	psrlq	mm6,8
    508 	pxor	mm7,[272+ebp*8+esp]
    509 	and	al,15
    510 	psllq	mm3,56
    511 	pxor	mm6,mm1
    512 	shr	edi,4
    513 	pinsrw	mm0,WORD [ecx*2+esi],2
    514 	pxor	mm7,[16+eax*8+esp]
    515 	rol	edx,8
    516 	pxor	mm6,[144+eax*8+esp]
    517 	pxor	mm7,mm3
    518 	pxor	mm6,[400+ebp*8+esp]
    519 	xor	bl,BYTE [ebp*1+esp]
    520 	mov	al,dl
    521 	movd	ecx,mm7
    522 	movzx	ebx,bl
    523 	psrlq	mm7,8
    524 	movq	mm3,mm6
    525 	mov	ebp,eax
    526 	psrlq	mm6,8
    527 	pxor	mm7,[272+edi*8+esp]
    528 	and	al,15
    529 	psllq	mm3,56
    530 	pxor	mm6,mm0
    531 	shr	ebp,4
    532 	pinsrw	mm2,WORD [ebx*2+esi],2
    533 	pxor	mm7,[16+eax*8+esp]
    534 	rol	edx,8
    535 	pxor	mm6,[144+eax*8+esp]
    536 	pxor	mm7,mm3
    537 	pxor	mm6,[400+edi*8+esp]
    538 	xor	cl,BYTE [edi*1+esp]
    539 	mov	al,dl
    540 	movd	ebx,mm7
    541 	movzx	ecx,cl
    542 	psrlq	mm7,8
    543 	movq	mm3,mm6
    544 	mov	edi,eax
    545 	psrlq	mm6,8
    546 	pxor	mm7,[272+ebp*8+esp]
    547 	and	al,15
    548 	psllq	mm3,56
    549 	pxor	mm6,mm2
    550 	shr	edi,4
    551 	pinsrw	mm1,WORD [ecx*2+esi],2
    552 	pxor	mm7,[16+eax*8+esp]
    553 	rol	edx,8
    554 	pxor	mm6,[144+eax*8+esp]
    555 	pxor	mm7,mm3
    556 	pxor	mm6,[400+ebp*8+esp]
    557 	xor	bl,BYTE [ebp*1+esp]
    558 	mov	al,dl
    559 	movd	ecx,mm7
    560 	movzx	ebx,bl
    561 	psrlq	mm7,8
    562 	movq	mm3,mm6
    563 	mov	ebp,eax
    564 	psrlq	mm6,8
    565 	pxor	mm7,[272+edi*8+esp]
    566 	and	al,15
    567 	psllq	mm3,56
    568 	pxor	mm6,mm1
    569 	shr	ebp,4
    570 	pinsrw	mm0,WORD [ebx*2+esi],2
    571 	pxor	mm7,[16+eax*8+esp]
    572 	rol	edx,8
    573 	pxor	mm6,[144+eax*8+esp]
    574 	pxor	mm7,mm3
    575 	pxor	mm6,[400+edi*8+esp]
    576 	xor	cl,BYTE [edi*1+esp]
    577 	mov	al,dl
    ; Last dword: lower half of the spilled mm6 (state bytes 0..3).
    578 	mov	edx,DWORD [528+esp]
    579 	movd	ebx,mm7
    580 	movzx	ecx,cl
    581 	psrlq	mm7,8
    582 	movq	mm3,mm6
    583 	mov	edi,eax
    584 	psrlq	mm6,8
    585 	pxor	mm7,[272+ebp*8+esp]
    586 	and	al,15
    587 	psllq	mm3,56
    588 	pxor	mm6,mm0
    589 	shr	edi,4
    590 	pinsrw	mm2,WORD [ecx*2+esi],2
    591 	pxor	mm7,[16+eax*8+esp]
    592 	rol	edx,8
    593 	pxor	mm6,[144+eax*8+esp]
    594 	pxor	mm7,mm3
    595 	pxor	mm6,[400+ebp*8+esp]
    596 	xor	bl,BYTE [ebp*1+esp]
    597 	mov	al,dl
    598 	movd	ecx,mm7
    599 	movzx	ebx,bl
    600 	psrlq	mm7,8
    601 	movq	mm3,mm6
    602 	mov	ebp,eax
    603 	psrlq	mm6,8
    604 	pxor	mm7,[272+edi*8+esp]
    605 	and	al,15
    606 	psllq	mm3,56
    607 	pxor	mm6,mm2
    608 	shr	ebp,4
    609 	pinsrw	mm1,WORD [ebx*2+esi],2
    610 	pxor	mm7,[16+eax*8+esp]
    611 	rol	edx,8
    612 	pxor	mm6,[144+eax*8+esp]
    613 	pxor	mm7,mm3
    614 	pxor	mm6,[400+edi*8+esp]
    615 	xor	cl,BYTE [edi*1+esp]
    616 	mov	al,dl
    617 	movd	ebx,mm7
    618 	movzx	ecx,cl
    619 	psrlq	mm7,8
    620 	movq	mm3,mm6
    621 	mov	edi,eax
    622 	psrlq	mm6,8
    623 	pxor	mm7,[272+ebp*8+esp]
    624 	and	al,15
    625 	psllq	mm3,56
    626 	pxor	mm6,mm1
    627 	shr	edi,4
    628 	pinsrw	mm0,WORD [ecx*2+esi],2
    629 	pxor	mm7,[16+eax*8+esp]
    630 	rol	edx,8
    631 	pxor	mm6,[144+eax*8+esp]
    632 	pxor	mm7,mm3
    633 	pxor	mm6,[400+ebp*8+esp]
    634 	xor	bl,BYTE [ebp*1+esp]
    635 	mov	al,dl
    636 	movd	ecx,mm7
    637 	movzx	ebx,bl
    638 	psrlq	mm7,8
    639 	movq	mm3,mm6
    640 	mov	ebp,eax
    641 	psrlq	mm6,8
    642 	pxor	mm7,[272+edi*8+esp]
    643 	and	al,15
    644 	psllq	mm3,56
    645 	pxor	mm6,mm0
    646 	shr	ebp,4
    647 	pinsrw	mm2,WORD [ebx*2+esi],2
    648 	pxor	mm7,[16+eax*8+esp]
    649 	rol	edx,8
    650 	pxor	mm6,[144+eax*8+esp]
    651 	pxor	mm7,mm3
    652 	pxor	mm6,[400+edi*8+esp]
    653 	xor	cl,BYTE [edi*1+esp]
    654 	mov	al,dl
    ; This load follows the reload pattern above, but all 16 bytes have already
    ; been fetched: edx is not read again before "movd edx,mm7" below
    ; overwrites it, so the value loaded here is never used.
    655 	mov	edx,DWORD [524+esp]
    656 	movd	ebx,mm7
    657 	movzx	ecx,cl
    658 	psrlq	mm7,8
    659 	movq	mm3,mm6
    660 	mov	edi,eax
    661 	psrlq	mm6,8
    662 	pxor	mm7,[272+ebp*8+esp]
    663 	and	al,15
    664 	psllq	mm3,56
    665 	pxor	mm6,mm2
    666 	shr	edi,4
    667 	pinsrw	mm1,WORD [ecx*2+esi],2
    668 	pxor	mm7,[16+eax*8+esp]
    669 	pxor	mm6,[144+eax*8+esp]
    670 	xor	bl,BYTE [ebp*1+esp]
    671 	pxor	mm7,mm3
    672 	pxor	mm6,[400+ebp*8+esp]
    673 	movzx	ebx,bl
    ; Final step for this block: the remaining shift is 4 bits rather than 8
    ; (psrlq ...,4 / psllq mm3,60) and the last high nibble (edi) indexes the
    ; unshifted tables at esp+16 / esp+144. The outstanding L$rem_8bit words
    ; are shifted into position (psllq mm1,4 / psllq mm0,12; mm2 uses word
    ; position 3) before being XORed into mm6.
    674 	pxor	mm2,mm2
    675 	psllq	mm1,4
    676 	movd	ecx,mm7
    677 	psrlq	mm7,4
    678 	movq	mm3,mm6
    679 	psrlq	mm6,4
    680 	shl	ecx,4
    681 	pxor	mm7,[16+edi*8+esp]
    682 	psllq	mm3,60
    683 	movzx	ecx,cl
    684 	pxor	mm7,mm3
    685 	pxor	mm6,[144+edi*8+esp]
    686 	pinsrw	mm0,WORD [ebx*2+esi],2
    687 	pxor	mm6,mm1
    688 	movd	edx,mm7
    689 	pinsrw	mm2,WORD [ecx*2+esi],3
    690 	psllq	mm0,12
    691 	pxor	mm6,mm0
    692 	psrlq	mm7,32
    693 	pxor	mm6,mm2
    ; Restore the input pointer spilled at the top of the iteration.
    694 	mov	ecx,DWORD [548+esp]
    695 	movd	ebx,mm7
    ; Repack the state in the same layout as at loop entry: edx and ebx are the
    ; two dwords of mm7, byte-swapped; mm6 is byte-reversed (psllw/psrlw/por
    ; swaps the bytes inside each word, pshufw 27 reverses the word order).
    696 	movq	mm3,mm6
    697 	psllw	mm6,8
    698 	psrlw	mm3,8
    699 	por	mm6,mm3
    700 	bswap	edx
    701 	pshufw	mm6,mm6,27
    702 	bswap	ebx
    ; Loop until the input pointer reaches the end pointer saved at [552+esp].
    703 	cmp	ecx,DWORD [552+esp]
    704 	jne	NEAR L$004outer
    ; Write the 16-byte state back through arg1.
    705 	mov	eax,DWORD [544+esp]
    706 	mov	DWORD [12+eax],edx
    707 	mov	DWORD [8+eax],ebx
    708 	movq	[eax],mm6
    ; Drop the aligned frame by restoring the esp saved in the prologue.
    709 	mov	esp,DWORD [556+esp]
    710 	emms
    711 	pop	edi
    712 	pop	esi
    713 	pop	ebx
    714 	pop	ebp
    715 	ret
    ; ---------------------------------------------------------------------
    ; _gcm_init_clmul(arg1, arg2)
    ;
    ; Stack arguments (no registers are pushed, so arg1 is at [4+esp]):
    ;   arg1 -> edx : output, 48 bytes written at offsets 0, 16 and 32
    ;   arg2 -> eax : 16-byte input
    ;   (presumably Htable and the hash key H -- the C prototype is not
    ;    visible here; confirm against the caller)
    ;
    ; The "db" sequences are hand-encoded instructions:
    ;   102,15,58,68,modrm,imm = pclmulqdq   102,15,58,15,modrm,imm = palignr
    ;
    ; Overwrites eax, ecx, edx and xmm0-xmm5.
    ; ---------------------------------------------------------------------
    716 global	_gcm_init_clmul
    717 align	16
    718 _gcm_init_clmul:
    719 L$_gcm_init_clmul_begin:
    720 	mov	edx,DWORD [4+esp]
    721 	mov	eax,DWORD [8+esp]
    ; Position-independent address of L$bswap in ecx.
    722 	call	L$005pic
    723 L$005pic:
    724 	pop	ecx
    725 	lea	ecx,[(L$bswap-L$005pic)+ecx]
    ; Load the input and swap its two qwords (pshufd 78).
    726 	movdqu	xmm2,[eax]
    727 	pshufd	xmm2,xmm2,78
    ; xmm2 <<= 1 as a 128-bit value: psllq shifts each qword, and the bit
    ; shifted out of the low qword is carried into the high qword through
    ; xmm3 (psrlq 63 / pslldq 8). xmm5 becomes an all-ones mask when the top
    ; dword (broadcast into xmm4) is negative, i.e. when the top bit was set
    ; before the shift; in that case the constant at L$bswap+16 is XORed in.
    728 	pshufd	xmm4,xmm2,255
    729 	movdqa	xmm3,xmm2
    730 	psllq	xmm2,1
    731 	pxor	xmm5,xmm5
    732 	psrlq	xmm3,63
    733 	pcmpgtd	xmm5,xmm4
    734 	pslldq	xmm3,8
    735 	por	xmm2,xmm3
    736 	pand	xmm5,[16+ecx]
    737 	pxor	xmm2,xmm5
    ; Multiply xmm2 by itself with three carry-less multiplies
    ; (Karatsuba-style): xmm3 / xmm4 hold low-qword XOR high-qword of each
    ; operand.
    738 	movdqa	xmm0,xmm2
    739 	movdqa	xmm1,xmm0
    740 	pshufd	xmm3,xmm0,78
    741 	pshufd	xmm4,xmm2,78
    742 	pxor	xmm3,xmm0
    743 	pxor	xmm4,xmm2
    ; pclmulqdq xmm0,xmm2,0x00   (low  x low)
    744 db	102,15,58,68,194,0
    ; pclmulqdq xmm1,xmm2,0x11   (high x high)
    745 db	102,15,58,68,202,17
    ; pclmulqdq xmm3,xmm4,0x00   (folded x folded)
    746 db	102,15,58,68,220,0
    ; Middle term = xmm3 ^ xmm0 ^ xmm1, split across the 256-bit product
    ; xmm1:xmm0.
    747 	xorps	xmm3,xmm0
    748 	xorps	xmm3,xmm1
    749 	movdqa	xmm4,xmm3
    750 	psrldq	xmm3,8
    751 	pslldq	xmm4,8
    752 	pxor	xmm1,xmm3
    753 	pxor	xmm0,xmm4
    ; Fold the 256-bit product xmm1:xmm0 down to 128 bits in xmm0 with two
    ; shift-and-XOR phases (presumably reduction modulo the GHASH polynomial
    ; -- the constant is implicit in the shift counts).
    754 	movdqa	xmm4,xmm0
    755 	movdqa	xmm3,xmm0
    756 	psllq	xmm0,5
    757 	pxor	xmm3,xmm0
    758 	psllq	xmm0,1
    759 	pxor	xmm0,xmm3
    760 	psllq	xmm0,57
    761 	movdqa	xmm3,xmm0
    762 	pslldq	xmm0,8
    763 	psrldq	xmm3,8
    764 	pxor	xmm0,xmm4
    765 	pxor	xmm1,xmm3
    766 	movdqa	xmm4,xmm0
    767 	psrlq	xmm0,1
    768 	pxor	xmm1,xmm4
    769 	pxor	xmm4,xmm0
    770 	psrlq	xmm0,5
    771 	pxor	xmm0,xmm4
    772 	psrlq	xmm0,1
    773 	pxor	xmm0,xmm1
    ; Output layout:
    ;   [edx+0]  = adjusted input (xmm2)
    ;   [edx+16] = its reduced square (xmm0)
    ;   [edx+32] = folded halves of both, packed by palignr: low qword from
    ;              xmm3 (fold of xmm2), high qword from xmm4 (fold of xmm0)
    774 	pshufd	xmm3,xmm2,78
    775 	pshufd	xmm4,xmm0,78
    776 	pxor	xmm3,xmm2
    777 	movdqu	[edx],xmm2
    778 	pxor	xmm4,xmm0
    779 	movdqu	[16+edx],xmm0
    ; palignr xmm4,xmm3,8
    780 db	102,15,58,15,227,8
    781 	movdqu	[32+edx],xmm4
    782 	ret
    ; ---------------------------------------------------------------------
    ; _gcm_gmult_clmul(arg1, arg2)
    ;
    ; Stack arguments (no registers are pushed, so arg1 is at [4+esp]):
    ;   arg1 -> eax : 16-byte buffer, read and overwritten in place
    ;   arg2 -> edx : table; 16 bytes are read at offset 0 and at offset 32
    ;   (presumably Xi and the Htable in the layout _gcm_init_clmul writes
    ;    -- the C prototype is not visible here; confirm against the caller)
    ;
    ; The "db" sequences are hand-encoded instructions:
    ;   102,15,56,0,modrm      = pshufb
    ;   102,15,58,68,modrm,imm = pclmulqdq
    ;
    ; Overwrites eax, ecx, edx and xmm0-xmm5.
    ; ---------------------------------------------------------------------
    783 global	_gcm_gmult_clmul
    784 align	16
    785 _gcm_gmult_clmul:
    786 L$_gcm_gmult_clmul_begin:
    787 	mov	eax,DWORD [4+esp]
    788 	mov	edx,DWORD [8+esp]
    ; Position-independent address of L$bswap in ecx.
    789 	call	L$006pic
    790 L$006pic:
    791 	pop	ecx
    792 	lea	ecx,[(L$bswap-L$006pic)+ecx]
    ; xmm0 = buffer, xmm5 = byte-reversal mask (first row of L$bswap),
    ; xmm2 = table entry at offset 0.
    793 	movdqu	xmm0,[eax]
    794 	movdqa	xmm5,[ecx]
    795 	movups	xmm2,[edx]
    ; pshufb xmm0,xmm5 : reverse the 16 bytes of the buffer
    796 db	102,15,56,0,197
    ; xmm4 = table entry at offset 32 (its low qword is the third multiply's
    ; second operand).
    797 	movups	xmm4,[32+edx]
    798 	movdqa	xmm1,xmm0
    799 	pshufd	xmm3,xmm0,78
    800 	pxor	xmm3,xmm0
    ; pclmulqdq xmm0,xmm2,0x00   (low  x low)
    801 db	102,15,58,68,194,0
    ; pclmulqdq xmm1,xmm2,0x11   (high x high)
    802 db	102,15,58,68,202,17
    ; pclmulqdq xmm3,xmm4,0x00   (folded x low qword of xmm4)
    803 db	102,15,58,68,220,0
    ; Middle term = xmm3 ^ xmm0 ^ xmm1, split across the 256-bit product
    ; xmm1:xmm0.
    804 	xorps	xmm3,xmm0
    805 	xorps	xmm3,xmm1
    806 	movdqa	xmm4,xmm3
    807 	psrldq	xmm3,8
    808 	pslldq	xmm4,8
    809 	pxor	xmm1,xmm3
    810 	pxor	xmm0,xmm4
    ; Same two-phase shift-and-XOR fold of xmm1:xmm0 into xmm0 as in
    ; _gcm_init_clmul.
    811 	movdqa	xmm4,xmm0
    812 	movdqa	xmm3,xmm0
    813 	psllq	xmm0,5
    814 	pxor	xmm3,xmm0
    815 	psllq	xmm0,1
    816 	pxor	xmm0,xmm3
    817 	psllq	xmm0,57
    818 	movdqa	xmm3,xmm0
    819 	pslldq	xmm0,8
    820 	psrldq	xmm3,8
    821 	pxor	xmm0,xmm4
    822 	pxor	xmm1,xmm3
    823 	movdqa	xmm4,xmm0
    824 	psrlq	xmm0,1
    825 	pxor	xmm1,xmm4
    826 	pxor	xmm4,xmm0
    827 	psrlq	xmm0,5
    828 	pxor	xmm0,xmm4
    829 	psrlq	xmm0,1
    830 	pxor	xmm0,xmm1
    ; pshufb xmm0,xmm5 : reverse the bytes back before storing
    831 db	102,15,56,0,197
    832 	movdqu	[eax],xmm0
    833 	ret
    ; ---------------------------------------------------------------------
    ; _gcm_ghash_clmul(arg1, arg2, arg3, arg4)
    ;
    ; Stack arguments (read after the four register pushes below):
    ;   arg1 -> eax : 16-byte state, read at entry and written at exit
    ;   arg2 -> edx : table; 16-byte entries are read at offsets 0, 16, 32
    ;   arg3 -> esi : input pointer
    ;   arg4 -> ebx : byte count
    ;   (presumably Xi, Htable, inp, len -- the C prototype is not visible
    ;    here; confirm against the caller)
    ;
    ; NOTE(review): no guard for a zero length is visible; with arg4 == 0 the
    ; "jz" below is not taken and two blocks are loaded. Callers presumably
    ; pass a non-zero multiple of 16 -- confirm.
    ;
    ; Input is consumed two 16-byte blocks at a time; a single leftover block
    ; goes through L$008odd_tail.
    ;
    ; The "db" sequences are hand-encoded instructions:
    ;   102,15,56,0,modrm      = pshufb
    ;   102,15,58,68,modrm,imm = pclmulqdq
    ;
    ; Pushes and pops ebp, ebx, esi, edi (ebp and edi are not otherwise used
    ; in this routine); overwrites eax, ecx, edx and xmm0-xmm7.
    ; ---------------------------------------------------------------------
    834 global	_gcm_ghash_clmul
    835 align	16
    836 _gcm_ghash_clmul:
    837 L$_gcm_ghash_clmul_begin:
    838 	push	ebp
    839 	push	ebx
    840 	push	esi
    841 	push	edi
    842 	mov	eax,DWORD [20+esp]
    843 	mov	edx,DWORD [24+esp]
    844 	mov	esi,DWORD [28+esp]
    845 	mov	ebx,DWORD [32+esp]
    ; Position-independent address of L$bswap in ecx.
    846 	call	L$007pic
    847 L$007pic:
    848 	pop	ecx
    849 	lea	ecx,[(L$bswap-L$007pic)+ecx]
    ; xmm0 = state, xmm5 = byte-reversal mask, xmm2 = table entry at offset 0.
    850 	movdqu	xmm0,[eax]
    851 	movdqa	xmm5,[ecx]
    852 	movdqu	xmm2,[edx]
    ; pshufb xmm0,xmm5
    853 db	102,15,56,0,197
    ; Exactly 16 bytes of input: go straight to the single-block path.
    854 	sub	ebx,16
    855 	jz	NEAR L$008odd_tail
    ; Load the first two blocks and byte-reverse them.
    856 	movdqu	xmm3,[esi]
    857 	movdqu	xmm6,[16+esi]
    ; pshufb xmm3,xmm5
    858 db	102,15,56,0,221
    ; pshufb xmm6,xmm5
    859 db	102,15,56,0,245
    ; xmm5 is reused for the table entry at offset 32; the mask is reloaded
    ; from [ecx] where it is needed again.
    860 	movdqu	xmm5,[32+edx]
    ; XOR the first block into the state; begin multiplying the second block
    ; (xmm6) by the offset-0 entry (xmm2).
    861 	pxor	xmm0,xmm3
    862 	pshufd	xmm3,xmm6,78
    863 	movdqa	xmm7,xmm6
    864 	pxor	xmm3,xmm6
    865 	lea	esi,[32+esi]
    ; pclmulqdq xmm6,xmm2,0x00
    866 db	102,15,58,68,242,0
    ; pclmulqdq xmm7,xmm2,0x11
    867 db	102,15,58,68,250,17
    ; pclmulqdq xmm3,xmm5,0x00
    868 db	102,15,58,68,221,0
    ; xmm2 = table entry at offset 16, used to multiply the state.
    869 	movups	xmm2,[16+edx]
    870 	nop
    ; No further pair of blocks available: finish in L$009even_tail.
    871 	sub	ebx,32
    872 	jbe	NEAR L$009even_tail
    873 	jmp	NEAR L$010mod_loop
    874 align	32
    875 L$010mod_loop:
    ; Each iteration multiplies the state (xmm0) by the offset-16 entry, adds
    ; the pending product of the previous second block (xmm6 / xmm7 / xmm3),
    ; folds the result to 128 bits, and, interleaved with that, loads the next
    ; two blocks and starts the multiply for the new second block.
    876 	pshufd	xmm4,xmm0,78
    877 	movdqa	xmm1,xmm0
    878 	pxor	xmm4,xmm0
    879 	nop
    ; pclmulqdq xmm0,xmm2,0x00
    880 db	102,15,58,68,194,0
    ; pclmulqdq xmm1,xmm2,0x11
    881 db	102,15,58,68,202,17
    ; pclmulqdq xmm4,xmm5,0x10   (low qword of xmm4 x high qword of xmm5)
    882 db	102,15,58,68,229,16
    883 	movups	xmm2,[edx]
    884 	xorps	xmm0,xmm6
    ; Reload the byte-reversal mask into xmm5.
    885 	movdqa	xmm5,[ecx]
    886 	xorps	xmm1,xmm7
    887 	movdqu	xmm7,[esi]
    888 	pxor	xmm3,xmm0
    889 	movdqu	xmm6,[16+esi]
    890 	pxor	xmm3,xmm1
    ; pshufb xmm7,xmm5
    891 db	102,15,56,0,253
    892 	pxor	xmm4,xmm3
    893 	movdqa	xmm3,xmm4
    894 	psrldq	xmm4,8
    895 	pslldq	xmm3,8
    896 	pxor	xmm1,xmm4
    897 	pxor	xmm0,xmm3
    ; pshufb xmm6,xmm5
    898 db	102,15,56,0,245
    ; XOR the next first block (xmm7) into the upper half of the product.
    899 	pxor	xmm1,xmm7
    900 	movdqa	xmm7,xmm6
    901 	movdqa	xmm4,xmm0
    902 	movdqa	xmm3,xmm0
    903 	psllq	xmm0,5
    904 	pxor	xmm3,xmm0
    905 	psllq	xmm0,1
    906 	pxor	xmm0,xmm3
    ; pclmulqdq xmm6,xmm2,0x00
    907 db	102,15,58,68,242,0
    908 	movups	xmm5,[32+edx]
    909 	psllq	xmm0,57
    910 	movdqa	xmm3,xmm0
    911 	pslldq	xmm0,8
    912 	psrldq	xmm3,8
    913 	pxor	xmm0,xmm4
    914 	pxor	xmm1,xmm3
    915 	pshufd	xmm3,xmm7,78
    916 	movdqa	xmm4,xmm0
    917 	psrlq	xmm0,1
    918 	pxor	xmm3,xmm7
    919 	pxor	xmm1,xmm4
    ; pclmulqdq xmm7,xmm2,0x11
    920 db	102,15,58,68,250,17
    921 	movups	xmm2,[16+edx]
    922 	pxor	xmm4,xmm0
    923 	psrlq	xmm0,5
    924 	pxor	xmm0,xmm4
    925 	psrlq	xmm0,1
    926 	pxor	xmm0,xmm1
    ; pclmulqdq xmm3,xmm5,0x00
    927 db	102,15,58,68,221,0
    928 	lea	esi,[32+esi]
    ; Unsigned compare: continue while more than 32 bytes remained before
    ; this subtraction.
    929 	sub	ebx,32
    930 	ja	NEAR L$010mod_loop
    931 L$009even_tail:
    ; Finish the pair in flight: multiply the state by the offset-16 entry,
    ; add the pending second-block product, and fold to 128 bits.
    932 	pshufd	xmm4,xmm0,78
    933 	movdqa	xmm1,xmm0
    934 	pxor	xmm4,xmm0
    ; pclmulqdq xmm0,xmm2,0x00
    935 db	102,15,58,68,194,0
    ; pclmulqdq xmm1,xmm2,0x11
    936 db	102,15,58,68,202,17
    ; pclmulqdq xmm4,xmm5,0x10
    937 db	102,15,58,68,229,16
    938 	movdqa	xmm5,[ecx]
    939 	xorps	xmm0,xmm6
    940 	xorps	xmm1,xmm7
    941 	pxor	xmm3,xmm0
    942 	pxor	xmm3,xmm1
    943 	pxor	xmm4,xmm3
    944 	movdqa	xmm3,xmm4
    945 	psrldq	xmm4,8
    946 	pslldq	xmm3,8
    947 	pxor	xmm1,xmm4
    948 	pxor	xmm0,xmm3
    949 	movdqa	xmm4,xmm0
    950 	movdqa	xmm3,xmm0
    951 	psllq	xmm0,5
    952 	pxor	xmm3,xmm0
    953 	psllq	xmm0,1
    954 	pxor	xmm0,xmm3
    955 	psllq	xmm0,57
    956 	movdqa	xmm3,xmm0
    957 	pslldq	xmm0,8
    958 	psrldq	xmm3,8
    959 	pxor	xmm0,xmm4
    960 	pxor	xmm1,xmm3
    961 	movdqa	xmm4,xmm0
    962 	psrlq	xmm0,1
    963 	pxor	xmm1,xmm4
    964 	pxor	xmm4,xmm0
    965 	psrlq	xmm0,5
    966 	pxor	xmm0,xmm4
    967 	psrlq	xmm0,1
    968 	pxor	xmm0,xmm1
    ; ebx == 0 here means one more 16-byte block remains; any other value
    ; means the input is exhausted.
    969 	test	ebx,ebx
    970 	jnz	NEAR L$011done
    971 	movups	xmm2,[edx]
    972 L$008odd_tail:
    ; Single block: XOR it into the state, multiply by the offset-0 entry
    ; (xmm2), and fold to 128 bits. The folded operand for the third multiply
    ; is computed from xmm2 here rather than loaded from offset 32.
    973 	movdqu	xmm3,[esi]
    ; pshufb xmm3,xmm5
    974 db	102,15,56,0,221
    975 	pxor	xmm0,xmm3
    976 	movdqa	xmm1,xmm0
    977 	pshufd	xmm3,xmm0,78
    978 	pshufd	xmm4,xmm2,78
    979 	pxor	xmm3,xmm0
    980 	pxor	xmm4,xmm2
    ; pclmulqdq xmm0,xmm2,0x00
    981 db	102,15,58,68,194,0
    ; pclmulqdq xmm1,xmm2,0x11
    982 db	102,15,58,68,202,17
    ; pclmulqdq xmm3,xmm4,0x00
    983 db	102,15,58,68,220,0
    984 	xorps	xmm3,xmm0
    985 	xorps	xmm3,xmm1
    986 	movdqa	xmm4,xmm3
    987 	psrldq	xmm3,8
    988 	pslldq	xmm4,8
    989 	pxor	xmm1,xmm3
    990 	pxor	xmm0,xmm4
    991 	movdqa	xmm4,xmm0
    992 	movdqa	xmm3,xmm0
    993 	psllq	xmm0,5
    994 	pxor	xmm3,xmm0
    995 	psllq	xmm0,1
    996 	pxor	xmm0,xmm3
    997 	psllq	xmm0,57
    998 	movdqa	xmm3,xmm0
    999 	pslldq	xmm0,8
   1000 	psrldq	xmm3,8
   1001 	pxor	xmm0,xmm4
   1002 	pxor	xmm1,xmm3
   1003 	movdqa	xmm4,xmm0
   1004 	psrlq	xmm0,1
   1005 	pxor	xmm1,xmm4
   1006 	pxor	xmm4,xmm0
   1007 	psrlq	xmm0,5
   1008 	pxor	xmm0,xmm4
   1009 	psrlq	xmm0,1
   1010 	pxor	xmm0,xmm1
   1011 L$011done:
    ; pshufb xmm0,xmm5 : reverse the bytes back, then store through arg1
   1012 db	102,15,56,0,197
   1013 	movdqu	[eax],xmm0
   1014 	pop	edi
   1015 	pop	esi
   1016 	pop	ebx
   1017 	pop	ebp
   1018 	ret
   ; Constants for the *_clmul routines, 64-byte aligned (the routines load
   ; them with movdqa / pand memory operands).
   1019 align	64
   1020 L$bswap:
   ; +0 : pshufb mask 15,14,...,0 -- reverses the 16 bytes of a register.
   1021 db	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
   ; +16: 128-bit constant with low byte 1 and high byte 194 (0xC2);
   ;      _gcm_init_clmul XORs it in when the input's top bit was set.
   1022 db	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
   ; 256 16-bit entries (32 rows of 8). _gcm_ghash_4bit_mmx indexes this
   ; table with a byte value ("pinsrw mmN,WORD [reg*2+esi],k") to obtain the
   ; word it XORs into the upper accumulator half after an 8-bit shift.
   1023 align	64
   1024 L$rem_8bit:
   1025 dw	0,450,900,582,1800,1738,1164,1358
   1026 dw	3600,4050,3476,3158,2328,2266,2716,2910
   1027 dw	7200,7650,8100,7782,6952,6890,6316,6510
   1028 dw	4656,5106,4532,4214,5432,5370,5820,6014
   1029 dw	14400,14722,15300,14854,16200,16010,15564,15630
   1030 dw	13904,14226,13780,13334,12632,12442,13020,13086
   1031 dw	9312,9634,10212,9766,9064,8874,8428,8494
   1032 dw	10864,11186,10740,10294,11640,11450,12028,12094
   1033 dw	28800,28994,29444,29382,30600,30282,29708,30158
   1034 dw	32400,32594,32020,31958,31128,30810,31260,31710
   1035 dw	27808,28002,28452,28390,27560,27242,26668,27118
   1036 dw	25264,25458,24884,24822,26040,25722,26172,26622
   1037 dw	18624,18690,19268,19078,20424,19978,19532,19854
   1038 dw	18128,18194,17748,17558,16856,16410,16988,17310
   1039 dw	21728,21794,22372,22182,21480,21034,20588,20910
   1040 dw	23280,23346,22900,22710,24056,23610,24188,24510
   1041 dw	57600,57538,57988,58182,58888,59338,58764,58446
   1042 dw	61200,61138,60564,60758,59416,59866,60316,59998
   1043 dw	64800,64738,65188,65382,64040,64490,63916,63598
   1044 dw	62256,62194,61620,61814,62520,62970,63420,63102
   1045 dw	55616,55426,56004,56070,56904,57226,56780,56334
   1046 dw	55120,54930,54484,54550,53336,53658,54236,53790
   1047 dw	50528,50338,50916,50982,49768,50090,49644,49198
   1048 dw	52080,51890,51444,51510,52344,52666,53244,52798
   1049 dw	37248,36930,37380,37830,38536,38730,38156,38094
   1050 dw	40848,40530,39956,40406,39064,39258,39708,39646
   1051 dw	36256,35938,36388,36838,35496,35690,35116,35054
   1052 dw	33712,33394,32820,33270,33976,34170,34620,34558
   1053 dw	43456,43010,43588,43910,44744,44810,44364,44174
   1054 dw	42960,42514,42068,42390,41176,41242,41820,41630
   1055 dw	46560,46114,46692,47014,45800,45866,45420,45230
   1056 dw	48112,47666,47220,47542,48376,48442,49020,48830
   ; 16 entries of 8 bytes each, written as dword pairs (low dword 0, value in
   ; the high dword). _gcm_gmult_4bit_mmx indexes it with a 4-bit value
   ; ("pxor mm1,[ebx*8+eax]") after each 4-bit shift.
   1057 align	64
   1058 L$rem_4bit:
   1059 dd	0,0,0,471859200,0,943718400,0,610271232
   1060 dd	0,1887436800,0,1822425088,0,1220542464,0,1423966208
   1061 dd	0,3774873600,0,4246732800,0,3644850176,0,3311403008
   1062 dd	0,2441084928,0,2376073216,0,2847932416,0,3051356160
   ; NUL-terminated ASCII tag embedded in the object code:
   ; "GHASH for x86, CRYPTOGAMS by <appro@openssl.org>"
   1063 db	71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
   1064 db	82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
   1065 db	112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
   1066 db	0
   1066 db	0
   1067