      1 #if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
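/*
 * Machine-generated GHASH (GCM) assembly for x86_64 in the Mach-O flavour
 * (underscore-prefixed globals, L$ local labels).  The attribution string at
 * the end of the file indicates it was produced by the OpenSSL/CRYPTOGAMS
 * perlasm script ghash-x86_64.pl; files like this are normally regenerated
 * rather than edited by hand.
 *
 * A rough sketch of the C prototypes these routines are usually given (names
 * and types assumed from the customary OpenSSL/BoringSSL GHASH interface,
 * where Xi is the 128-bit hash state and Htable holds precomputed multiples
 * of the hash key H):
 *
 *   void gcm_gmult_4bit(uint64_t Xi[2], const u128 Htable[16]);
 *   void gcm_ghash_4bit(uint64_t Xi[2], const u128 Htable[16],
 *                       const uint8_t *inp, size_t len);
 *   void gcm_init_clmul(u128 Htable[16], const uint64_t H[2]);
 *   void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]);
 *   void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16],
 *                        const uint8_t *inp, size_t len);
 *   void gcm_init_avx(u128 Htable[16], const uint64_t H[2]);
 *   void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]);
 *   void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16],
 *                      const uint8_t *inp, size_t len);
 */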
      2 .text
      3 
      4 
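/*
 * gcm_gmult_4bit: multiply the hash state Xi (%rdi) by the hash key in
 * GF(2^128) using the 4-bit table method: Htable (%rsi) is indexed one
 * nibble of Xi at a time and L$rem_4bit supplies the reduction constants.
 * The result is byte-swapped and stored back over Xi.
 */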
      5 .globl	_gcm_gmult_4bit
      6 .private_extern _gcm_gmult_4bit
      7 
      8 .p2align	4
      9 _gcm_gmult_4bit:
     10 	pushq	%rbx
     11 	pushq	%rbp
     12 	pushq	%r12
     13 	pushq	%r13
     14 	pushq	%r14
     15 	pushq	%r15
     16 	subq	$280,%rsp
     17 L$gmult_prologue:
     18 
     19 	movzbq	15(%rdi),%r8
     20 	leaq	L$rem_4bit(%rip),%r11
     21 	xorq	%rax,%rax
     22 	xorq	%rbx,%rbx
     23 	movb	%r8b,%al
     24 	movb	%r8b,%bl
     25 	shlb	$4,%al
     26 	movq	$14,%rcx
     27 	movq	8(%rsi,%rax,1),%r8
     28 	movq	(%rsi,%rax,1),%r9
     29 	andb	$0xf0,%bl
     30 	movq	%r8,%rdx
     31 	jmp	L$oop1
     32 
     33 .p2align	4
     34 L$oop1:
     35 	shrq	$4,%r8
     36 	andq	$0xf,%rdx
     37 	movq	%r9,%r10
     38 	movb	(%rdi,%rcx,1),%al
     39 	shrq	$4,%r9
     40 	xorq	8(%rsi,%rbx,1),%r8
     41 	shlq	$60,%r10
     42 	xorq	(%rsi,%rbx,1),%r9
     43 	movb	%al,%bl
     44 	xorq	(%r11,%rdx,8),%r9
     45 	movq	%r8,%rdx
     46 	shlb	$4,%al
     47 	xorq	%r10,%r8
     48 	decq	%rcx
     49 	js	L$break1
     50 
     51 	shrq	$4,%r8
     52 	andq	$0xf,%rdx
     53 	movq	%r9,%r10
     54 	shrq	$4,%r9
     55 	xorq	8(%rsi,%rax,1),%r8
     56 	shlq	$60,%r10
     57 	xorq	(%rsi,%rax,1),%r9
     58 	andb	$0xf0,%bl
     59 	xorq	(%r11,%rdx,8),%r9
     60 	movq	%r8,%rdx
     61 	xorq	%r10,%r8
     62 	jmp	L$oop1
     63 
     64 .p2align	4
     65 L$break1:
     66 	shrq	$4,%r8
     67 	andq	$0xf,%rdx
     68 	movq	%r9,%r10
     69 	shrq	$4,%r9
     70 	xorq	8(%rsi,%rax,1),%r8
     71 	shlq	$60,%r10
     72 	xorq	(%rsi,%rax,1),%r9
     73 	andb	$0xf0,%bl
     74 	xorq	(%r11,%rdx,8),%r9
     75 	movq	%r8,%rdx
     76 	xorq	%r10,%r8
     77 
     78 	shrq	$4,%r8
     79 	andq	$0xf,%rdx
     80 	movq	%r9,%r10
     81 	shrq	$4,%r9
     82 	xorq	8(%rsi,%rbx,1),%r8
     83 	shlq	$60,%r10
     84 	xorq	(%rsi,%rbx,1),%r9
     85 	xorq	%r10,%r8
     86 	xorq	(%r11,%rdx,8),%r9
     87 
     88 	bswapq	%r8
     89 	bswapq	%r9
     90 	movq	%r8,8(%rdi)
     91 	movq	%r9,(%rdi)
     92 
     93 	leaq	280+48(%rsp),%rsi
     94 	movq	-8(%rsi),%rbx
     95 	leaq	(%rsi),%rsp
     96 L$gmult_epilogue:
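/* 0xf3,0xc3 below is a hand-encoded "rep; ret", used in place of a plain ret
   throughout this file (a common perlasm idiom, apparently to sidestep a
   branch-prediction quirk of single-byte returns on some older AMD cores). */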
     97 	.byte	0xf3,0xc3
     98 
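/*
 * gcm_ghash_4bit: fold a whole buffer into the hash state.  On entry
 * %rdi = Xi, %rsi = Htable, %rdx = input, %rcx = length (a multiple of 16).
 * It uses a byte-at-a-time variant of the table method driven by L$rem_8bit,
 * working from a pre-shifted copy of Htable built on the stack (see below).
 */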
     99 .globl	_gcm_ghash_4bit
    100 .private_extern _gcm_ghash_4bit
    101 
    102 .p2align	4
    103 _gcm_ghash_4bit:
    104 	pushq	%rbx
    105 	pushq	%rbp
    106 	pushq	%r12
    107 	pushq	%r13
    108 	pushq	%r14
    109 	pushq	%r15
    110 	subq	$280,%rsp
    111 L$ghash_prologue:
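/* The long unrolled block below appears to pre-shift each of the sixteen
   Htable entries right by four bits into a scratch table addressed via %rbp,
   saving the shifted-out low nibble of each entry (moved into the high half
   of a byte) at 0..15(%rsp); the byte-oriented loop that follows indexes
   these instead of the raw Htable. */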
    112 	movq	%rdx,%r14
    113 	movq	%rcx,%r15
    114 	subq	$-128,%rsi
    115 	leaq	16+128(%rsp),%rbp
    116 	xorl	%edx,%edx
    117 	movq	0+0-128(%rsi),%r8
    118 	movq	0+8-128(%rsi),%rax
    119 	movb	%al,%dl
    120 	shrq	$4,%rax
    121 	movq	%r8,%r10
    122 	shrq	$4,%r8
    123 	movq	16+0-128(%rsi),%r9
    124 	shlb	$4,%dl
    125 	movq	16+8-128(%rsi),%rbx
    126 	shlq	$60,%r10
    127 	movb	%dl,0(%rsp)
    128 	orq	%r10,%rax
    129 	movb	%bl,%dl
    130 	shrq	$4,%rbx
    131 	movq	%r9,%r10
    132 	shrq	$4,%r9
    133 	movq	%r8,0(%rbp)
    134 	movq	32+0-128(%rsi),%r8
    135 	shlb	$4,%dl
    136 	movq	%rax,0-128(%rbp)
    137 	movq	32+8-128(%rsi),%rax
    138 	shlq	$60,%r10
    139 	movb	%dl,1(%rsp)
    140 	orq	%r10,%rbx
    141 	movb	%al,%dl
    142 	shrq	$4,%rax
    143 	movq	%r8,%r10
    144 	shrq	$4,%r8
    145 	movq	%r9,8(%rbp)
    146 	movq	48+0-128(%rsi),%r9
    147 	shlb	$4,%dl
    148 	movq	%rbx,8-128(%rbp)
    149 	movq	48+8-128(%rsi),%rbx
    150 	shlq	$60,%r10
    151 	movb	%dl,2(%rsp)
    152 	orq	%r10,%rax
    153 	movb	%bl,%dl
    154 	shrq	$4,%rbx
    155 	movq	%r9,%r10
    156 	shrq	$4,%r9
    157 	movq	%r8,16(%rbp)
    158 	movq	64+0-128(%rsi),%r8
    159 	shlb	$4,%dl
    160 	movq	%rax,16-128(%rbp)
    161 	movq	64+8-128(%rsi),%rax
    162 	shlq	$60,%r10
    163 	movb	%dl,3(%rsp)
    164 	orq	%r10,%rbx
    165 	movb	%al,%dl
    166 	shrq	$4,%rax
    167 	movq	%r8,%r10
    168 	shrq	$4,%r8
    169 	movq	%r9,24(%rbp)
    170 	movq	80+0-128(%rsi),%r9
    171 	shlb	$4,%dl
    172 	movq	%rbx,24-128(%rbp)
    173 	movq	80+8-128(%rsi),%rbx
    174 	shlq	$60,%r10
    175 	movb	%dl,4(%rsp)
    176 	orq	%r10,%rax
    177 	movb	%bl,%dl
    178 	shrq	$4,%rbx
    179 	movq	%r9,%r10
    180 	shrq	$4,%r9
    181 	movq	%r8,32(%rbp)
    182 	movq	96+0-128(%rsi),%r8
    183 	shlb	$4,%dl
    184 	movq	%rax,32-128(%rbp)
    185 	movq	96+8-128(%rsi),%rax
    186 	shlq	$60,%r10
    187 	movb	%dl,5(%rsp)
    188 	orq	%r10,%rbx
    189 	movb	%al,%dl
    190 	shrq	$4,%rax
    191 	movq	%r8,%r10
    192 	shrq	$4,%r8
    193 	movq	%r9,40(%rbp)
    194 	movq	112+0-128(%rsi),%r9
    195 	shlb	$4,%dl
    196 	movq	%rbx,40-128(%rbp)
    197 	movq	112+8-128(%rsi),%rbx
    198 	shlq	$60,%r10
    199 	movb	%dl,6(%rsp)
    200 	orq	%r10,%rax
    201 	movb	%bl,%dl
    202 	shrq	$4,%rbx
    203 	movq	%r9,%r10
    204 	shrq	$4,%r9
    205 	movq	%r8,48(%rbp)
    206 	movq	128+0-128(%rsi),%r8
    207 	shlb	$4,%dl
    208 	movq	%rax,48-128(%rbp)
    209 	movq	128+8-128(%rsi),%rax
    210 	shlq	$60,%r10
    211 	movb	%dl,7(%rsp)
    212 	orq	%r10,%rbx
    213 	movb	%al,%dl
    214 	shrq	$4,%rax
    215 	movq	%r8,%r10
    216 	shrq	$4,%r8
    217 	movq	%r9,56(%rbp)
    218 	movq	144+0-128(%rsi),%r9
    219 	shlb	$4,%dl
    220 	movq	%rbx,56-128(%rbp)
    221 	movq	144+8-128(%rsi),%rbx
    222 	shlq	$60,%r10
    223 	movb	%dl,8(%rsp)
    224 	orq	%r10,%rax
    225 	movb	%bl,%dl
    226 	shrq	$4,%rbx
    227 	movq	%r9,%r10
    228 	shrq	$4,%r9
    229 	movq	%r8,64(%rbp)
    230 	movq	160+0-128(%rsi),%r8
    231 	shlb	$4,%dl
    232 	movq	%rax,64-128(%rbp)
    233 	movq	160+8-128(%rsi),%rax
    234 	shlq	$60,%r10
    235 	movb	%dl,9(%rsp)
    236 	orq	%r10,%rbx
    237 	movb	%al,%dl
    238 	shrq	$4,%rax
    239 	movq	%r8,%r10
    240 	shrq	$4,%r8
    241 	movq	%r9,72(%rbp)
    242 	movq	176+0-128(%rsi),%r9
    243 	shlb	$4,%dl
    244 	movq	%rbx,72-128(%rbp)
    245 	movq	176+8-128(%rsi),%rbx
    246 	shlq	$60,%r10
    247 	movb	%dl,10(%rsp)
    248 	orq	%r10,%rax
    249 	movb	%bl,%dl
    250 	shrq	$4,%rbx
    251 	movq	%r9,%r10
    252 	shrq	$4,%r9
    253 	movq	%r8,80(%rbp)
    254 	movq	192+0-128(%rsi),%r8
    255 	shlb	$4,%dl
    256 	movq	%rax,80-128(%rbp)
    257 	movq	192+8-128(%rsi),%rax
    258 	shlq	$60,%r10
    259 	movb	%dl,11(%rsp)
    260 	orq	%r10,%rbx
    261 	movb	%al,%dl
    262 	shrq	$4,%rax
    263 	movq	%r8,%r10
    264 	shrq	$4,%r8
    265 	movq	%r9,88(%rbp)
    266 	movq	208+0-128(%rsi),%r9
    267 	shlb	$4,%dl
    268 	movq	%rbx,88-128(%rbp)
    269 	movq	208+8-128(%rsi),%rbx
    270 	shlq	$60,%r10
    271 	movb	%dl,12(%rsp)
    272 	orq	%r10,%rax
    273 	movb	%bl,%dl
    274 	shrq	$4,%rbx
    275 	movq	%r9,%r10
    276 	shrq	$4,%r9
    277 	movq	%r8,96(%rbp)
    278 	movq	224+0-128(%rsi),%r8
    279 	shlb	$4,%dl
    280 	movq	%rax,96-128(%rbp)
    281 	movq	224+8-128(%rsi),%rax
    282 	shlq	$60,%r10
    283 	movb	%dl,13(%rsp)
    284 	orq	%r10,%rbx
    285 	movb	%al,%dl
    286 	shrq	$4,%rax
    287 	movq	%r8,%r10
    288 	shrq	$4,%r8
    289 	movq	%r9,104(%rbp)
    290 	movq	240+0-128(%rsi),%r9
    291 	shlb	$4,%dl
    292 	movq	%rbx,104-128(%rbp)
    293 	movq	240+8-128(%rsi),%rbx
    294 	shlq	$60,%r10
    295 	movb	%dl,14(%rsp)
    296 	orq	%r10,%rax
    297 	movb	%bl,%dl
    298 	shrq	$4,%rbx
    299 	movq	%r9,%r10
    300 	shrq	$4,%r9
    301 	movq	%r8,112(%rbp)
    302 	shlb	$4,%dl
    303 	movq	%rax,112-128(%rbp)
    304 	shlq	$60,%r10
    305 	movb	%dl,15(%rsp)
    306 	orq	%r10,%rbx
    307 	movq	%r9,120(%rbp)
    308 	movq	%rbx,120-128(%rbp)
    309 	addq	$-128,%rsi
    310 	movq	8(%rdi),%r8
    311 	movq	0(%rdi),%r9
    312 	addq	%r14,%r15
    313 	leaq	L$rem_8bit(%rip),%r11
    314 	jmp	L$outer_loop
    315 .p2align	4
    316 L$outer_loop:
    317 	xorq	(%r14),%r9
    318 	movq	8(%r14),%rdx
    319 	leaq	16(%r14),%r14
    320 	xorq	%r8,%rdx
    321 	movq	%r9,(%rdi)
    322 	movq	%rdx,8(%rdi)
    323 	shrq	$32,%rdx
    324 	xorq	%rax,%rax
    325 	roll	$8,%edx
    326 	movb	%dl,%al
    327 	movzbl	%dl,%ebx
    328 	shlb	$4,%al
    329 	shrl	$4,%ebx
    330 	roll	$8,%edx
    331 	movq	8(%rsi,%rax,1),%r8
    332 	movq	(%rsi,%rax,1),%r9
    333 	movb	%dl,%al
    334 	movzbl	%dl,%ecx
    335 	shlb	$4,%al
    336 	movzbq	(%rsp,%rbx,1),%r12
    337 	shrl	$4,%ecx
    338 	xorq	%r8,%r12
    339 	movq	%r9,%r10
    340 	shrq	$8,%r8
    341 	movzbq	%r12b,%r12
    342 	shrq	$8,%r9
    343 	xorq	-128(%rbp,%rbx,8),%r8
    344 	shlq	$56,%r10
    345 	xorq	(%rbp,%rbx,8),%r9
    346 	roll	$8,%edx
    347 	xorq	8(%rsi,%rax,1),%r8
    348 	xorq	(%rsi,%rax,1),%r9
    349 	movb	%dl,%al
    350 	xorq	%r10,%r8
    351 	movzwq	(%r11,%r12,2),%r12
    352 	movzbl	%dl,%ebx
    353 	shlb	$4,%al
    354 	movzbq	(%rsp,%rcx,1),%r13
    355 	shrl	$4,%ebx
    356 	shlq	$48,%r12
    357 	xorq	%r8,%r13
    358 	movq	%r9,%r10
    359 	xorq	%r12,%r9
    360 	shrq	$8,%r8
    361 	movzbq	%r13b,%r13
    362 	shrq	$8,%r9
    363 	xorq	-128(%rbp,%rcx,8),%r8
    364 	shlq	$56,%r10
    365 	xorq	(%rbp,%rcx,8),%r9
    366 	roll	$8,%edx
    367 	xorq	8(%rsi,%rax,1),%r8
    368 	xorq	(%rsi,%rax,1),%r9
    369 	movb	%dl,%al
    370 	xorq	%r10,%r8
    371 	movzwq	(%r11,%r13,2),%r13
    372 	movzbl	%dl,%ecx
    373 	shlb	$4,%al
    374 	movzbq	(%rsp,%rbx,1),%r12
    375 	shrl	$4,%ecx
    376 	shlq	$48,%r13
    377 	xorq	%r8,%r12
    378 	movq	%r9,%r10
    379 	xorq	%r13,%r9
    380 	shrq	$8,%r8
    381 	movzbq	%r12b,%r12
    382 	movl	8(%rdi),%edx
    383 	shrq	$8,%r9
    384 	xorq	-128(%rbp,%rbx,8),%r8
    385 	shlq	$56,%r10
    386 	xorq	(%rbp,%rbx,8),%r9
    387 	roll	$8,%edx
    388 	xorq	8(%rsi,%rax,1),%r8
    389 	xorq	(%rsi,%rax,1),%r9
    390 	movb	%dl,%al
    391 	xorq	%r10,%r8
    392 	movzwq	(%r11,%r12,2),%r12
    393 	movzbl	%dl,%ebx
    394 	shlb	$4,%al
    395 	movzbq	(%rsp,%rcx,1),%r13
    396 	shrl	$4,%ebx
    397 	shlq	$48,%r12
    398 	xorq	%r8,%r13
    399 	movq	%r9,%r10
    400 	xorq	%r12,%r9
    401 	shrq	$8,%r8
    402 	movzbq	%r13b,%r13
    403 	shrq	$8,%r9
    404 	xorq	-128(%rbp,%rcx,8),%r8
    405 	shlq	$56,%r10
    406 	xorq	(%rbp,%rcx,8),%r9
    407 	roll	$8,%edx
    408 	xorq	8(%rsi,%rax,1),%r8
    409 	xorq	(%rsi,%rax,1),%r9
    410 	movb	%dl,%al
    411 	xorq	%r10,%r8
    412 	movzwq	(%r11,%r13,2),%r13
    413 	movzbl	%dl,%ecx
    414 	shlb	$4,%al
    415 	movzbq	(%rsp,%rbx,1),%r12
    416 	shrl	$4,%ecx
    417 	shlq	$48,%r13
    418 	xorq	%r8,%r12
    419 	movq	%r9,%r10
    420 	xorq	%r13,%r9
    421 	shrq	$8,%r8
    422 	movzbq	%r12b,%r12
    423 	shrq	$8,%r9
    424 	xorq	-128(%rbp,%rbx,8),%r8
    425 	shlq	$56,%r10
    426 	xorq	(%rbp,%rbx,8),%r9
    427 	roll	$8,%edx
    428 	xorq	8(%rsi,%rax,1),%r8
    429 	xorq	(%rsi,%rax,1),%r9
    430 	movb	%dl,%al
    431 	xorq	%r10,%r8
    432 	movzwq	(%r11,%r12,2),%r12
    433 	movzbl	%dl,%ebx
    434 	shlb	$4,%al
    435 	movzbq	(%rsp,%rcx,1),%r13
    436 	shrl	$4,%ebx
    437 	shlq	$48,%r12
    438 	xorq	%r8,%r13
    439 	movq	%r9,%r10
    440 	xorq	%r12,%r9
    441 	shrq	$8,%r8
    442 	movzbq	%r13b,%r13
    443 	shrq	$8,%r9
    444 	xorq	-128(%rbp,%rcx,8),%r8
    445 	shlq	$56,%r10
    446 	xorq	(%rbp,%rcx,8),%r9
    447 	roll	$8,%edx
    448 	xorq	8(%rsi,%rax,1),%r8
    449 	xorq	(%rsi,%rax,1),%r9
    450 	movb	%dl,%al
    451 	xorq	%r10,%r8
    452 	movzwq	(%r11,%r13,2),%r13
    453 	movzbl	%dl,%ecx
    454 	shlb	$4,%al
    455 	movzbq	(%rsp,%rbx,1),%r12
    456 	shrl	$4,%ecx
    457 	shlq	$48,%r13
    458 	xorq	%r8,%r12
    459 	movq	%r9,%r10
    460 	xorq	%r13,%r9
    461 	shrq	$8,%r8
    462 	movzbq	%r12b,%r12
    463 	movl	4(%rdi),%edx
    464 	shrq	$8,%r9
    465 	xorq	-128(%rbp,%rbx,8),%r8
    466 	shlq	$56,%r10
    467 	xorq	(%rbp,%rbx,8),%r9
    468 	roll	$8,%edx
    469 	xorq	8(%rsi,%rax,1),%r8
    470 	xorq	(%rsi,%rax,1),%r9
    471 	movb	%dl,%al
    472 	xorq	%r10,%r8
    473 	movzwq	(%r11,%r12,2),%r12
    474 	movzbl	%dl,%ebx
    475 	shlb	$4,%al
    476 	movzbq	(%rsp,%rcx,1),%r13
    477 	shrl	$4,%ebx
    478 	shlq	$48,%r12
    479 	xorq	%r8,%r13
    480 	movq	%r9,%r10
    481 	xorq	%r12,%r9
    482 	shrq	$8,%r8
    483 	movzbq	%r13b,%r13
    484 	shrq	$8,%r9
    485 	xorq	-128(%rbp,%rcx,8),%r8
    486 	shlq	$56,%r10
    487 	xorq	(%rbp,%rcx,8),%r9
    488 	roll	$8,%edx
    489 	xorq	8(%rsi,%rax,1),%r8
    490 	xorq	(%rsi,%rax,1),%r9
    491 	movb	%dl,%al
    492 	xorq	%r10,%r8
    493 	movzwq	(%r11,%r13,2),%r13
    494 	movzbl	%dl,%ecx
    495 	shlb	$4,%al
    496 	movzbq	(%rsp,%rbx,1),%r12
    497 	shrl	$4,%ecx
    498 	shlq	$48,%r13
    499 	xorq	%r8,%r12
    500 	movq	%r9,%r10
    501 	xorq	%r13,%r9
    502 	shrq	$8,%r8
    503 	movzbq	%r12b,%r12
    504 	shrq	$8,%r9
    505 	xorq	-128(%rbp,%rbx,8),%r8
    506 	shlq	$56,%r10
    507 	xorq	(%rbp,%rbx,8),%r9
    508 	roll	$8,%edx
    509 	xorq	8(%rsi,%rax,1),%r8
    510 	xorq	(%rsi,%rax,1),%r9
    511 	movb	%dl,%al
    512 	xorq	%r10,%r8
    513 	movzwq	(%r11,%r12,2),%r12
    514 	movzbl	%dl,%ebx
    515 	shlb	$4,%al
    516 	movzbq	(%rsp,%rcx,1),%r13
    517 	shrl	$4,%ebx
    518 	shlq	$48,%r12
    519 	xorq	%r8,%r13
    520 	movq	%r9,%r10
    521 	xorq	%r12,%r9
    522 	shrq	$8,%r8
    523 	movzbq	%r13b,%r13
    524 	shrq	$8,%r9
    525 	xorq	-128(%rbp,%rcx,8),%r8
    526 	shlq	$56,%r10
    527 	xorq	(%rbp,%rcx,8),%r9
    528 	roll	$8,%edx
    529 	xorq	8(%rsi,%rax,1),%r8
    530 	xorq	(%rsi,%rax,1),%r9
    531 	movb	%dl,%al
    532 	xorq	%r10,%r8
    533 	movzwq	(%r11,%r13,2),%r13
    534 	movzbl	%dl,%ecx
    535 	shlb	$4,%al
    536 	movzbq	(%rsp,%rbx,1),%r12
    537 	shrl	$4,%ecx
    538 	shlq	$48,%r13
    539 	xorq	%r8,%r12
    540 	movq	%r9,%r10
    541 	xorq	%r13,%r9
    542 	shrq	$8,%r8
    543 	movzbq	%r12b,%r12
    544 	movl	0(%rdi),%edx
    545 	shrq	$8,%r9
    546 	xorq	-128(%rbp,%rbx,8),%r8
    547 	shlq	$56,%r10
    548 	xorq	(%rbp,%rbx,8),%r9
    549 	roll	$8,%edx
    550 	xorq	8(%rsi,%rax,1),%r8
    551 	xorq	(%rsi,%rax,1),%r9
    552 	movb	%dl,%al
    553 	xorq	%r10,%r8
    554 	movzwq	(%r11,%r12,2),%r12
    555 	movzbl	%dl,%ebx
    556 	shlb	$4,%al
    557 	movzbq	(%rsp,%rcx,1),%r13
    558 	shrl	$4,%ebx
    559 	shlq	$48,%r12
    560 	xorq	%r8,%r13
    561 	movq	%r9,%r10
    562 	xorq	%r12,%r9
    563 	shrq	$8,%r8
    564 	movzbq	%r13b,%r13
    565 	shrq	$8,%r9
    566 	xorq	-128(%rbp,%rcx,8),%r8
    567 	shlq	$56,%r10
    568 	xorq	(%rbp,%rcx,8),%r9
    569 	roll	$8,%edx
    570 	xorq	8(%rsi,%rax,1),%r8
    571 	xorq	(%rsi,%rax,1),%r9
    572 	movb	%dl,%al
    573 	xorq	%r10,%r8
    574 	movzwq	(%r11,%r13,2),%r13
    575 	movzbl	%dl,%ecx
    576 	shlb	$4,%al
    577 	movzbq	(%rsp,%rbx,1),%r12
    578 	shrl	$4,%ecx
    579 	shlq	$48,%r13
    580 	xorq	%r8,%r12
    581 	movq	%r9,%r10
    582 	xorq	%r13,%r9
    583 	shrq	$8,%r8
    584 	movzbq	%r12b,%r12
    585 	shrq	$8,%r9
    586 	xorq	-128(%rbp,%rbx,8),%r8
    587 	shlq	$56,%r10
    588 	xorq	(%rbp,%rbx,8),%r9
    589 	roll	$8,%edx
    590 	xorq	8(%rsi,%rax,1),%r8
    591 	xorq	(%rsi,%rax,1),%r9
    592 	movb	%dl,%al
    593 	xorq	%r10,%r8
    594 	movzwq	(%r11,%r12,2),%r12
    595 	movzbl	%dl,%ebx
    596 	shlb	$4,%al
    597 	movzbq	(%rsp,%rcx,1),%r13
    598 	shrl	$4,%ebx
    599 	shlq	$48,%r12
    600 	xorq	%r8,%r13
    601 	movq	%r9,%r10
    602 	xorq	%r12,%r9
    603 	shrq	$8,%r8
    604 	movzbq	%r13b,%r13
    605 	shrq	$8,%r9
    606 	xorq	-128(%rbp,%rcx,8),%r8
    607 	shlq	$56,%r10
    608 	xorq	(%rbp,%rcx,8),%r9
    609 	roll	$8,%edx
    610 	xorq	8(%rsi,%rax,1),%r8
    611 	xorq	(%rsi,%rax,1),%r9
    612 	movb	%dl,%al
    613 	xorq	%r10,%r8
    614 	movzwq	(%r11,%r13,2),%r13
    615 	movzbl	%dl,%ecx
    616 	shlb	$4,%al
    617 	movzbq	(%rsp,%rbx,1),%r12
    618 	andl	$240,%ecx
    619 	shlq	$48,%r13
    620 	xorq	%r8,%r12
    621 	movq	%r9,%r10
    622 	xorq	%r13,%r9
    623 	shrq	$8,%r8
    624 	movzbq	%r12b,%r12
    625 	movl	-4(%rdi),%edx
    626 	shrq	$8,%r9
    627 	xorq	-128(%rbp,%rbx,8),%r8
    628 	shlq	$56,%r10
    629 	xorq	(%rbp,%rbx,8),%r9
    630 	movzwq	(%r11,%r12,2),%r12
    631 	xorq	8(%rsi,%rax,1),%r8
    632 	xorq	(%rsi,%rax,1),%r9
    633 	shlq	$48,%r12
    634 	xorq	%r10,%r8
    635 	xorq	%r12,%r9
    636 	movzbq	%r8b,%r13
    637 	shrq	$4,%r8
    638 	movq	%r9,%r10
    639 	shlb	$4,%r13b
    640 	shrq	$4,%r9
    641 	xorq	8(%rsi,%rcx,1),%r8
    642 	movzwq	(%r11,%r13,2),%r13
    643 	shlq	$60,%r10
    644 	xorq	(%rsi,%rcx,1),%r9
    645 	xorq	%r10,%r8
    646 	shlq	$48,%r13
    647 	bswapq	%r8
    648 	xorq	%r13,%r9
    649 	bswapq	%r9
    650 	cmpq	%r15,%r14
    651 	jb	L$outer_loop
    652 	movq	%r8,8(%rdi)
    653 	movq	%r9,(%rdi)
    654 
    655 	leaq	280+48(%rsp),%rsi
    656 	movq	-48(%rsi),%r15
    657 	movq	-40(%rsi),%r14
    658 	movq	-32(%rsi),%r13
    659 	movq	-24(%rsi),%r12
    660 	movq	-16(%rsi),%rbp
    661 	movq	-8(%rsi),%rbx
    662 	leaq	0(%rsi),%rsp
    663 L$ghash_epilogue:
    664 	.byte	0xf3,0xc3
    665 
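/*
 * gcm_init_clmul: expand the raw hash key H (%rsi) into Htable (%rdi) for the
 * PCLMULQDQ-based code paths.  The key is first pre-processed (halves swapped
 * and shifted, with the L$0x1c2_polynomial adjustment) into the form the
 * multiplication code expects, then successive powers of H are computed and
 * stored together with the "xored halves" values used for Karatsuba-style
 * multiplication.
 */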
    666 .globl	_gcm_init_clmul
    667 .private_extern _gcm_init_clmul
    668 
    669 .p2align	4
    670 _gcm_init_clmul:
    671 L$_init_clmul:
    672 	movdqu	(%rsi),%xmm2
    673 	pshufd	$78,%xmm2,%xmm2
    674 
    675 
    676 	pshufd	$255,%xmm2,%xmm4
    677 	movdqa	%xmm2,%xmm3
    678 	psllq	$1,%xmm2
    679 	pxor	%xmm5,%xmm5
    680 	psrlq	$63,%xmm3
    681 	pcmpgtd	%xmm4,%xmm5
    682 	pslldq	$8,%xmm3
    683 	por	%xmm3,%xmm2
    684 
    685 
    686 	pand	L$0x1c2_polynomial(%rip),%xmm5
    687 	pxor	%xmm5,%xmm2
    688 
    689 
    690 	pshufd	$78,%xmm2,%xmm6
    691 	movdqa	%xmm2,%xmm0
    692 	pxor	%xmm2,%xmm6
    693 	movdqa	%xmm0,%xmm1
    694 	pshufd	$78,%xmm0,%xmm3
    695 	pxor	%xmm0,%xmm3
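/* The .byte sequences here and below are hand-encoded SSE instructions,
   emitted as raw bytes so that older assemblers can still build the file:
   for example 102,15,58,68,... encodes pclmulqdq (carry-less multiply) and
   102,15,56,0,... encodes pshufb (the byte-swap shuffle). */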
    696 .byte	102,15,58,68,194,0
    697 .byte	102,15,58,68,202,17
    698 .byte	102,15,58,68,222,0
    699 	pxor	%xmm0,%xmm3
    700 	pxor	%xmm1,%xmm3
    701 
    702 	movdqa	%xmm3,%xmm4
    703 	psrldq	$8,%xmm3
    704 	pslldq	$8,%xmm4
    705 	pxor	%xmm3,%xmm1
    706 	pxor	%xmm4,%xmm0
    707 
    708 	movdqa	%xmm0,%xmm4
    709 	movdqa	%xmm0,%xmm3
    710 	psllq	$5,%xmm0
    711 	pxor	%xmm0,%xmm3
    712 	psllq	$1,%xmm0
    713 	pxor	%xmm3,%xmm0
    714 	psllq	$57,%xmm0
    715 	movdqa	%xmm0,%xmm3
    716 	pslldq	$8,%xmm0
    717 	psrldq	$8,%xmm3
    718 	pxor	%xmm4,%xmm0
    719 	pxor	%xmm3,%xmm1
    720 
    721 
    722 	movdqa	%xmm0,%xmm4
    723 	psrlq	$1,%xmm0
    724 	pxor	%xmm4,%xmm1
    725 	pxor	%xmm0,%xmm4
    726 	psrlq	$5,%xmm0
    727 	pxor	%xmm4,%xmm0
    728 	psrlq	$1,%xmm0
    729 	pxor	%xmm1,%xmm0
    730 	pshufd	$78,%xmm2,%xmm3
    731 	pshufd	$78,%xmm0,%xmm4
    732 	pxor	%xmm2,%xmm3
    733 	movdqu	%xmm2,0(%rdi)
    734 	pxor	%xmm0,%xmm4
    735 	movdqu	%xmm0,16(%rdi)
    736 .byte	102,15,58,15,227,8
    737 	movdqu	%xmm4,32(%rdi)
    738 	movdqa	%xmm0,%xmm1
    739 	pshufd	$78,%xmm0,%xmm3
    740 	pxor	%xmm0,%xmm3
    741 .byte	102,15,58,68,194,0
    742 .byte	102,15,58,68,202,17
    743 .byte	102,15,58,68,222,0
    744 	pxor	%xmm0,%xmm3
    745 	pxor	%xmm1,%xmm3
    746 
    747 	movdqa	%xmm3,%xmm4
    748 	psrldq	$8,%xmm3
    749 	pslldq	$8,%xmm4
    750 	pxor	%xmm3,%xmm1
    751 	pxor	%xmm4,%xmm0
    752 
    753 	movdqa	%xmm0,%xmm4
    754 	movdqa	%xmm0,%xmm3
    755 	psllq	$5,%xmm0
    756 	pxor	%xmm0,%xmm3
    757 	psllq	$1,%xmm0
    758 	pxor	%xmm3,%xmm0
    759 	psllq	$57,%xmm0
    760 	movdqa	%xmm0,%xmm3
    761 	pslldq	$8,%xmm0
    762 	psrldq	$8,%xmm3
    763 	pxor	%xmm4,%xmm0
    764 	pxor	%xmm3,%xmm1
    765 
    766 
    767 	movdqa	%xmm0,%xmm4
    768 	psrlq	$1,%xmm0
    769 	pxor	%xmm4,%xmm1
    770 	pxor	%xmm0,%xmm4
    771 	psrlq	$5,%xmm0
    772 	pxor	%xmm4,%xmm0
    773 	psrlq	$1,%xmm0
    774 	pxor	%xmm1,%xmm0
    775 	movdqa	%xmm0,%xmm5
    776 	movdqa	%xmm0,%xmm1
    777 	pshufd	$78,%xmm0,%xmm3
    778 	pxor	%xmm0,%xmm3
    779 .byte	102,15,58,68,194,0
    780 .byte	102,15,58,68,202,17
    781 .byte	102,15,58,68,222,0
    782 	pxor	%xmm0,%xmm3
    783 	pxor	%xmm1,%xmm3
    784 
    785 	movdqa	%xmm3,%xmm4
    786 	psrldq	$8,%xmm3
    787 	pslldq	$8,%xmm4
    788 	pxor	%xmm3,%xmm1
    789 	pxor	%xmm4,%xmm0
    790 
    791 	movdqa	%xmm0,%xmm4
    792 	movdqa	%xmm0,%xmm3
    793 	psllq	$5,%xmm0
    794 	pxor	%xmm0,%xmm3
    795 	psllq	$1,%xmm0
    796 	pxor	%xmm3,%xmm0
    797 	psllq	$57,%xmm0
    798 	movdqa	%xmm0,%xmm3
    799 	pslldq	$8,%xmm0
    800 	psrldq	$8,%xmm3
    801 	pxor	%xmm4,%xmm0
    802 	pxor	%xmm3,%xmm1
    803 
    804 
    805 	movdqa	%xmm0,%xmm4
    806 	psrlq	$1,%xmm0
    807 	pxor	%xmm4,%xmm1
    808 	pxor	%xmm0,%xmm4
    809 	psrlq	$5,%xmm0
    810 	pxor	%xmm4,%xmm0
    811 	psrlq	$1,%xmm0
    812 	pxor	%xmm1,%xmm0
    813 	pshufd	$78,%xmm5,%xmm3
    814 	pshufd	$78,%xmm0,%xmm4
    815 	pxor	%xmm5,%xmm3
    816 	movdqu	%xmm5,48(%rdi)
    817 	pxor	%xmm0,%xmm4
    818 	movdqu	%xmm0,64(%rdi)
    819 .byte	102,15,58,15,227,8
    820 	movdqu	%xmm4,80(%rdi)
    821 	.byte	0xf3,0xc3
    822 
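/*
 * gcm_gmult_clmul: one GHASH multiplication of Xi (%rdi) by H from Htable
 * (%rsi) using PCLMULQDQ, followed by reduction modulo the GHASH polynomial
 * (see the L$0x1c2_polynomial constant).
 */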
    823 .globl	_gcm_gmult_clmul
    824 .private_extern _gcm_gmult_clmul
    825 
    826 .p2align	4
    827 _gcm_gmult_clmul:
    828 L$_gmult_clmul:
    829 	movdqu	(%rdi),%xmm0
    830 	movdqa	L$bswap_mask(%rip),%xmm5
    831 	movdqu	(%rsi),%xmm2
    832 	movdqu	32(%rsi),%xmm4
    833 .byte	102,15,56,0,197
    834 	movdqa	%xmm0,%xmm1
    835 	pshufd	$78,%xmm0,%xmm3
    836 	pxor	%xmm0,%xmm3
    837 .byte	102,15,58,68,194,0
    838 .byte	102,15,58,68,202,17
    839 .byte	102,15,58,68,220,0
    840 	pxor	%xmm0,%xmm3
    841 	pxor	%xmm1,%xmm3
    842 
    843 	movdqa	%xmm3,%xmm4
    844 	psrldq	$8,%xmm3
    845 	pslldq	$8,%xmm4
    846 	pxor	%xmm3,%xmm1
    847 	pxor	%xmm4,%xmm0
    848 
    849 	movdqa	%xmm0,%xmm4
    850 	movdqa	%xmm0,%xmm3
    851 	psllq	$5,%xmm0
    852 	pxor	%xmm0,%xmm3
    853 	psllq	$1,%xmm0
    854 	pxor	%xmm3,%xmm0
    855 	psllq	$57,%xmm0
    856 	movdqa	%xmm0,%xmm3
    857 	pslldq	$8,%xmm0
    858 	psrldq	$8,%xmm3
    859 	pxor	%xmm4,%xmm0
    860 	pxor	%xmm3,%xmm1
    861 
    862 
    863 	movdqa	%xmm0,%xmm4
    864 	psrlq	$1,%xmm0
    865 	pxor	%xmm4,%xmm1
    866 	pxor	%xmm0,%xmm4
    867 	psrlq	$5,%xmm0
    868 	pxor	%xmm4,%xmm0
    869 	psrlq	$1,%xmm0
    870 	pxor	%xmm1,%xmm0
    871 .byte	102,15,56,0,197
    872 	movdqu	%xmm0,(%rdi)
    873 	.byte	0xf3,0xc3
    874 
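/*
 * gcm_ghash_clmul: bulk GHASH over %rcx bytes at %rdx using PCLMULQDQ.
 * When enough input is available it aggregates four blocks per iteration
 * (L$mod4_loop) with deferred reduction; the OPENSSL_ia32cap_P check appears
 * to steer some low-power (Atom-class) CPUs away from the 4x path, and
 * stragglers are handled in L$even_tail / L$odd_tail.
 */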
    875 .globl	_gcm_ghash_clmul
    876 .private_extern _gcm_ghash_clmul
    877 
    878 .p2align	5
    879 _gcm_ghash_clmul:
    880 L$_ghash_clmul:
    881 	movdqa	L$bswap_mask(%rip),%xmm10
    882 
    883 	movdqu	(%rdi),%xmm0
    884 	movdqu	(%rsi),%xmm2
    885 	movdqu	32(%rsi),%xmm7
    886 .byte	102,65,15,56,0,194
    887 
    888 	subq	$0x10,%rcx
    889 	jz	L$odd_tail
    890 
    891 	movdqu	16(%rsi),%xmm6
    892 	leaq	_OPENSSL_ia32cap_P(%rip),%rax
    893 	movl	4(%rax),%eax
    894 	cmpq	$0x30,%rcx
    895 	jb	L$skip4x
    896 
    897 	andl	$71303168,%eax
    898 	cmpl	$4194304,%eax
    899 	je	L$skip4x
    900 
    901 	subq	$0x30,%rcx
    902 	movq	$0xA040608020C0E000,%rax
    903 	movdqu	48(%rsi),%xmm14
    904 	movdqu	64(%rsi),%xmm15
    905 
    906 
    907 
    908 
    909 	movdqu	48(%rdx),%xmm3
    910 	movdqu	32(%rdx),%xmm11
    911 .byte	102,65,15,56,0,218
    912 .byte	102,69,15,56,0,218
    913 	movdqa	%xmm3,%xmm5
    914 	pshufd	$78,%xmm3,%xmm4
    915 	pxor	%xmm3,%xmm4
    916 .byte	102,15,58,68,218,0
    917 .byte	102,15,58,68,234,17
    918 .byte	102,15,58,68,231,0
    919 
    920 	movdqa	%xmm11,%xmm13
    921 	pshufd	$78,%xmm11,%xmm12
    922 	pxor	%xmm11,%xmm12
    923 .byte	102,68,15,58,68,222,0
    924 .byte	102,68,15,58,68,238,17
    925 .byte	102,68,15,58,68,231,16
    926 	xorps	%xmm11,%xmm3
    927 	xorps	%xmm13,%xmm5
    928 	movups	80(%rsi),%xmm7
    929 	xorps	%xmm12,%xmm4
    930 
    931 	movdqu	16(%rdx),%xmm11
    932 	movdqu	0(%rdx),%xmm8
    933 .byte	102,69,15,56,0,218
    934 .byte	102,69,15,56,0,194
    935 	movdqa	%xmm11,%xmm13
    936 	pshufd	$78,%xmm11,%xmm12
    937 	pxor	%xmm8,%xmm0
    938 	pxor	%xmm11,%xmm12
    939 .byte	102,69,15,58,68,222,0
    940 	movdqa	%xmm0,%xmm1
    941 	pshufd	$78,%xmm0,%xmm8
    942 	pxor	%xmm0,%xmm8
    943 .byte	102,69,15,58,68,238,17
    944 .byte	102,68,15,58,68,231,0
    945 	xorps	%xmm11,%xmm3
    946 	xorps	%xmm13,%xmm5
    947 
    948 	leaq	64(%rdx),%rdx
    949 	subq	$0x40,%rcx
    950 	jc	L$tail4x
    951 
    952 	jmp	L$mod4_loop
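/* Main 4x loop: each iteration multiplies four input blocks by H^4..H^1,
   sums the partial products, and interleaves the shift-based polynomial
   reduction of the running hash with those multiplications. */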
    953 .p2align	5
    954 L$mod4_loop:
    955 .byte	102,65,15,58,68,199,0
    956 	xorps	%xmm12,%xmm4
    957 	movdqu	48(%rdx),%xmm11
    958 .byte	102,69,15,56,0,218
    959 .byte	102,65,15,58,68,207,17
    960 	xorps	%xmm3,%xmm0
    961 	movdqu	32(%rdx),%xmm3
    962 	movdqa	%xmm11,%xmm13
    963 .byte	102,68,15,58,68,199,16
    964 	pshufd	$78,%xmm11,%xmm12
    965 	xorps	%xmm5,%xmm1
    966 	pxor	%xmm11,%xmm12
    967 .byte	102,65,15,56,0,218
    968 	movups	32(%rsi),%xmm7
    969 	xorps	%xmm4,%xmm8
    970 .byte	102,68,15,58,68,218,0
    971 	pshufd	$78,%xmm3,%xmm4
    972 
    973 	pxor	%xmm0,%xmm8
    974 	movdqa	%xmm3,%xmm5
    975 	pxor	%xmm1,%xmm8
    976 	pxor	%xmm3,%xmm4
    977 	movdqa	%xmm8,%xmm9
    978 .byte	102,68,15,58,68,234,17
    979 	pslldq	$8,%xmm8
    980 	psrldq	$8,%xmm9
    981 	pxor	%xmm8,%xmm0
    982 	movdqa	L$7_mask(%rip),%xmm8
    983 	pxor	%xmm9,%xmm1
    984 .byte	102,76,15,110,200
    985 
    986 	pand	%xmm0,%xmm8
    987 .byte	102,69,15,56,0,200
    988 	pxor	%xmm0,%xmm9
    989 .byte	102,68,15,58,68,231,0
    990 	psllq	$57,%xmm9
    991 	movdqa	%xmm9,%xmm8
    992 	pslldq	$8,%xmm9
    993 .byte	102,15,58,68,222,0
    994 	psrldq	$8,%xmm8
    995 	pxor	%xmm9,%xmm0
    996 	pxor	%xmm8,%xmm1
    997 	movdqu	0(%rdx),%xmm8
    998 
    999 	movdqa	%xmm0,%xmm9
   1000 	psrlq	$1,%xmm0
   1001 .byte	102,15,58,68,238,17
   1002 	xorps	%xmm11,%xmm3
   1003 	movdqu	16(%rdx),%xmm11
   1004 .byte	102,69,15,56,0,218
   1005 .byte	102,15,58,68,231,16
   1006 	xorps	%xmm13,%xmm5
   1007 	movups	80(%rsi),%xmm7
   1008 .byte	102,69,15,56,0,194
   1009 	pxor	%xmm9,%xmm1
   1010 	pxor	%xmm0,%xmm9
   1011 	psrlq	$5,%xmm0
   1012 
   1013 	movdqa	%xmm11,%xmm13
   1014 	pxor	%xmm12,%xmm4
   1015 	pshufd	$78,%xmm11,%xmm12
   1016 	pxor	%xmm9,%xmm0
   1017 	pxor	%xmm8,%xmm1
   1018 	pxor	%xmm11,%xmm12
   1019 .byte	102,69,15,58,68,222,0
   1020 	psrlq	$1,%xmm0
   1021 	pxor	%xmm1,%xmm0
   1022 	movdqa	%xmm0,%xmm1
   1023 .byte	102,69,15,58,68,238,17
   1024 	xorps	%xmm11,%xmm3
   1025 	pshufd	$78,%xmm0,%xmm8
   1026 	pxor	%xmm0,%xmm8
   1027 
   1028 .byte	102,68,15,58,68,231,0
   1029 	xorps	%xmm13,%xmm5
   1030 
   1031 	leaq	64(%rdx),%rdx
   1032 	subq	$0x40,%rcx
   1033 	jnc	L$mod4_loop
   1034 
   1035 L$tail4x:
   1036 .byte	102,65,15,58,68,199,0
   1037 .byte	102,65,15,58,68,207,17
   1038 .byte	102,68,15,58,68,199,16
   1039 	xorps	%xmm12,%xmm4
   1040 	xorps	%xmm3,%xmm0
   1041 	xorps	%xmm5,%xmm1
   1042 	pxor	%xmm0,%xmm1
   1043 	pxor	%xmm4,%xmm8
   1044 
   1045 	pxor	%xmm1,%xmm8
   1046 	pxor	%xmm0,%xmm1
   1047 
   1048 	movdqa	%xmm8,%xmm9
   1049 	psrldq	$8,%xmm8
   1050 	pslldq	$8,%xmm9
   1051 	pxor	%xmm8,%xmm1
   1052 	pxor	%xmm9,%xmm0
   1053 
   1054 	movdqa	%xmm0,%xmm4
   1055 	movdqa	%xmm0,%xmm3
   1056 	psllq	$5,%xmm0
   1057 	pxor	%xmm0,%xmm3
   1058 	psllq	$1,%xmm0
   1059 	pxor	%xmm3,%xmm0
   1060 	psllq	$57,%xmm0
   1061 	movdqa	%xmm0,%xmm3
   1062 	pslldq	$8,%xmm0
   1063 	psrldq	$8,%xmm3
   1064 	pxor	%xmm4,%xmm0
   1065 	pxor	%xmm3,%xmm1
   1066 
   1067 
   1068 	movdqa	%xmm0,%xmm4
   1069 	psrlq	$1,%xmm0
   1070 	pxor	%xmm4,%xmm1
   1071 	pxor	%xmm0,%xmm4
   1072 	psrlq	$5,%xmm0
   1073 	pxor	%xmm4,%xmm0
   1074 	psrlq	$1,%xmm0
   1075 	pxor	%xmm1,%xmm0
   1076 	addq	$0x40,%rcx
   1077 	jz	L$done
   1078 	movdqu	32(%rsi),%xmm7
   1079 	subq	$0x10,%rcx
   1080 	jz	L$odd_tail
   1081 L$skip4x:
   1082 
   1083 
   1084 
   1085 
   1086 
   1087 	movdqu	(%rdx),%xmm8
   1088 	movdqu	16(%rdx),%xmm3
   1089 .byte	102,69,15,56,0,194
   1090 .byte	102,65,15,56,0,218
   1091 	pxor	%xmm8,%xmm0
   1092 
   1093 	movdqa	%xmm3,%xmm5
   1094 	pshufd	$78,%xmm3,%xmm4
   1095 	pxor	%xmm3,%xmm4
   1096 .byte	102,15,58,68,218,0
   1097 .byte	102,15,58,68,234,17
   1098 .byte	102,15,58,68,231,0
   1099 
   1100 	leaq	32(%rdx),%rdx
   1101 	nop
   1102 	subq	$0x20,%rcx
   1103 	jbe	L$even_tail
   1104 	nop
   1105 	jmp	L$mod_loop
   1106 
   1107 .p2align	5
   1108 L$mod_loop:
   1109 	movdqa	%xmm0,%xmm1
   1110 	movdqa	%xmm4,%xmm8
   1111 	pshufd	$78,%xmm0,%xmm4
   1112 	pxor	%xmm0,%xmm4
   1113 
   1114 .byte	102,15,58,68,198,0
   1115 .byte	102,15,58,68,206,17
   1116 .byte	102,15,58,68,231,16
   1117 
   1118 	pxor	%xmm3,%xmm0
   1119 	pxor	%xmm5,%xmm1
   1120 	movdqu	(%rdx),%xmm9
   1121 	pxor	%xmm0,%xmm8
   1122 .byte	102,69,15,56,0,202
   1123 	movdqu	16(%rdx),%xmm3
   1124 
   1125 	pxor	%xmm1,%xmm8
   1126 	pxor	%xmm9,%xmm1
   1127 	pxor	%xmm8,%xmm4
   1128 .byte	102,65,15,56,0,218
   1129 	movdqa	%xmm4,%xmm8
   1130 	psrldq	$8,%xmm8
   1131 	pslldq	$8,%xmm4
   1132 	pxor	%xmm8,%xmm1
   1133 	pxor	%xmm4,%xmm0
   1134 
   1135 	movdqa	%xmm3,%xmm5
   1136 
   1137 	movdqa	%xmm0,%xmm9
   1138 	movdqa	%xmm0,%xmm8
   1139 	psllq	$5,%xmm0
   1140 	pxor	%xmm0,%xmm8
   1141 .byte	102,15,58,68,218,0
   1142 	psllq	$1,%xmm0
   1143 	pxor	%xmm8,%xmm0
   1144 	psllq	$57,%xmm0
   1145 	movdqa	%xmm0,%xmm8
   1146 	pslldq	$8,%xmm0
   1147 	psrldq	$8,%xmm8
   1148 	pxor	%xmm9,%xmm0
   1149 	pshufd	$78,%xmm5,%xmm4
   1150 	pxor	%xmm8,%xmm1
   1151 	pxor	%xmm5,%xmm4
   1152 
   1153 	movdqa	%xmm0,%xmm9
   1154 	psrlq	$1,%xmm0
   1155 .byte	102,15,58,68,234,17
   1156 	pxor	%xmm9,%xmm1
   1157 	pxor	%xmm0,%xmm9
   1158 	psrlq	$5,%xmm0
   1159 	pxor	%xmm9,%xmm0
   1160 	leaq	32(%rdx),%rdx
   1161 	psrlq	$1,%xmm0
   1162 .byte	102,15,58,68,231,0
   1163 	pxor	%xmm1,%xmm0
   1164 
   1165 	subq	$0x20,%rcx
   1166 	ja	L$mod_loop
   1167 
   1168 L$even_tail:
   1169 	movdqa	%xmm0,%xmm1
   1170 	movdqa	%xmm4,%xmm8
   1171 	pshufd	$78,%xmm0,%xmm4
   1172 	pxor	%xmm0,%xmm4
   1173 
   1174 .byte	102,15,58,68,198,0
   1175 .byte	102,15,58,68,206,17
   1176 .byte	102,15,58,68,231,16
   1177 
   1178 	pxor	%xmm3,%xmm0
   1179 	pxor	%xmm5,%xmm1
   1180 	pxor	%xmm0,%xmm8
   1181 	pxor	%xmm1,%xmm8
   1182 	pxor	%xmm8,%xmm4
   1183 	movdqa	%xmm4,%xmm8
   1184 	psrldq	$8,%xmm8
   1185 	pslldq	$8,%xmm4
   1186 	pxor	%xmm8,%xmm1
   1187 	pxor	%xmm4,%xmm0
   1188 
   1189 	movdqa	%xmm0,%xmm4
   1190 	movdqa	%xmm0,%xmm3
   1191 	psllq	$5,%xmm0
   1192 	pxor	%xmm0,%xmm3
   1193 	psllq	$1,%xmm0
   1194 	pxor	%xmm3,%xmm0
   1195 	psllq	$57,%xmm0
   1196 	movdqa	%xmm0,%xmm3
   1197 	pslldq	$8,%xmm0
   1198 	psrldq	$8,%xmm3
   1199 	pxor	%xmm4,%xmm0
   1200 	pxor	%xmm3,%xmm1
   1201 
   1202 
   1203 	movdqa	%xmm0,%xmm4
   1204 	psrlq	$1,%xmm0
   1205 	pxor	%xmm4,%xmm1
   1206 	pxor	%xmm0,%xmm4
   1207 	psrlq	$5,%xmm0
   1208 	pxor	%xmm4,%xmm0
   1209 	psrlq	$1,%xmm0
   1210 	pxor	%xmm1,%xmm0
   1211 	testq	%rcx,%rcx
   1212 	jnz	L$done
   1213 
   1214 L$odd_tail:
   1215 	movdqu	(%rdx),%xmm8
   1216 .byte	102,69,15,56,0,194
   1217 	pxor	%xmm8,%xmm0
   1218 	movdqa	%xmm0,%xmm1
   1219 	pshufd	$78,%xmm0,%xmm3
   1220 	pxor	%xmm0,%xmm3
   1221 .byte	102,15,58,68,194,0
   1222 .byte	102,15,58,68,202,17
   1223 .byte	102,15,58,68,223,0
   1224 	pxor	%xmm0,%xmm3
   1225 	pxor	%xmm1,%xmm3
   1226 
   1227 	movdqa	%xmm3,%xmm4
   1228 	psrldq	$8,%xmm3
   1229 	pslldq	$8,%xmm4
   1230 	pxor	%xmm3,%xmm1
   1231 	pxor	%xmm4,%xmm0
   1232 
   1233 	movdqa	%xmm0,%xmm4
   1234 	movdqa	%xmm0,%xmm3
   1235 	psllq	$5,%xmm0
   1236 	pxor	%xmm0,%xmm3
   1237 	psllq	$1,%xmm0
   1238 	pxor	%xmm3,%xmm0
   1239 	psllq	$57,%xmm0
   1240 	movdqa	%xmm0,%xmm3
   1241 	pslldq	$8,%xmm0
   1242 	psrldq	$8,%xmm3
   1243 	pxor	%xmm4,%xmm0
   1244 	pxor	%xmm3,%xmm1
   1245 
   1246 
   1247 	movdqa	%xmm0,%xmm4
   1248 	psrlq	$1,%xmm0
   1249 	pxor	%xmm4,%xmm1
   1250 	pxor	%xmm0,%xmm4
   1251 	psrlq	$5,%xmm0
   1252 	pxor	%xmm4,%xmm0
   1253 	psrlq	$1,%xmm0
   1254 	pxor	%xmm1,%xmm0
   1255 L$done:
   1256 .byte	102,65,15,56,0,194
   1257 	movdqu	%xmm0,(%rdi)
   1258 	.byte	0xf3,0xc3
   1259 
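/*
 * gcm_init_avx: AVX (VEX-encoded) variant of gcm_init_clmul.  It derives the
 * adjusted key as above and then iterates L$init_loop_avx to fill Htable with
 * successive powers of H and their xored halves, enough for the
 * eight-blocks-per-iteration loop in gcm_ghash_avx.
 */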
   1260 .globl	_gcm_init_avx
   1261 .private_extern _gcm_init_avx
   1262 
   1263 .p2align	5
   1264 _gcm_init_avx:
   1265 	vzeroupper
   1266 
   1267 	vmovdqu	(%rsi),%xmm2
   1268 	vpshufd	$78,%xmm2,%xmm2
   1269 
   1270 
   1271 	vpshufd	$255,%xmm2,%xmm4
   1272 	vpsrlq	$63,%xmm2,%xmm3
   1273 	vpsllq	$1,%xmm2,%xmm2
   1274 	vpxor	%xmm5,%xmm5,%xmm5
   1275 	vpcmpgtd	%xmm4,%xmm5,%xmm5
   1276 	vpslldq	$8,%xmm3,%xmm3
   1277 	vpor	%xmm3,%xmm2,%xmm2
   1278 
   1279 
   1280 	vpand	L$0x1c2_polynomial(%rip),%xmm5,%xmm5
   1281 	vpxor	%xmm5,%xmm2,%xmm2
   1282 
   1283 	vpunpckhqdq	%xmm2,%xmm2,%xmm6
   1284 	vmovdqa	%xmm2,%xmm0
   1285 	vpxor	%xmm2,%xmm6,%xmm6
   1286 	movq	$4,%r10
   1287 	jmp	L$init_start_avx
   1288 .p2align	5
   1289 L$init_loop_avx:
   1290 	vpalignr	$8,%xmm3,%xmm4,%xmm5
   1291 	vmovdqu	%xmm5,-16(%rdi)
   1292 	vpunpckhqdq	%xmm0,%xmm0,%xmm3
   1293 	vpxor	%xmm0,%xmm3,%xmm3
   1294 	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
   1295 	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
   1296 	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
   1297 	vpxor	%xmm0,%xmm1,%xmm4
   1298 	vpxor	%xmm4,%xmm3,%xmm3
   1299 
   1300 	vpslldq	$8,%xmm3,%xmm4
   1301 	vpsrldq	$8,%xmm3,%xmm3
   1302 	vpxor	%xmm4,%xmm0,%xmm0
   1303 	vpxor	%xmm3,%xmm1,%xmm1
   1304 	vpsllq	$57,%xmm0,%xmm3
   1305 	vpsllq	$62,%xmm0,%xmm4
   1306 	vpxor	%xmm3,%xmm4,%xmm4
   1307 	vpsllq	$63,%xmm0,%xmm3
   1308 	vpxor	%xmm3,%xmm4,%xmm4
   1309 	vpslldq	$8,%xmm4,%xmm3
   1310 	vpsrldq	$8,%xmm4,%xmm4
   1311 	vpxor	%xmm3,%xmm0,%xmm0
   1312 	vpxor	%xmm4,%xmm1,%xmm1
   1313 
   1314 	vpsrlq	$1,%xmm0,%xmm4
   1315 	vpxor	%xmm0,%xmm1,%xmm1
   1316 	vpxor	%xmm4,%xmm0,%xmm0
   1317 	vpsrlq	$5,%xmm4,%xmm4
   1318 	vpxor	%xmm4,%xmm0,%xmm0
   1319 	vpsrlq	$1,%xmm0,%xmm0
   1320 	vpxor	%xmm1,%xmm0,%xmm0
   1321 L$init_start_avx:
   1322 	vmovdqa	%xmm0,%xmm5
   1323 	vpunpckhqdq	%xmm0,%xmm0,%xmm3
   1324 	vpxor	%xmm0,%xmm3,%xmm3
   1325 	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm1
   1326 	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm0
   1327 	vpclmulqdq	$0x00,%xmm6,%xmm3,%xmm3
   1328 	vpxor	%xmm0,%xmm1,%xmm4
   1329 	vpxor	%xmm4,%xmm3,%xmm3
   1330 
   1331 	vpslldq	$8,%xmm3,%xmm4
   1332 	vpsrldq	$8,%xmm3,%xmm3
   1333 	vpxor	%xmm4,%xmm0,%xmm0
   1334 	vpxor	%xmm3,%xmm1,%xmm1
   1335 	vpsllq	$57,%xmm0,%xmm3
   1336 	vpsllq	$62,%xmm0,%xmm4
   1337 	vpxor	%xmm3,%xmm4,%xmm4
   1338 	vpsllq	$63,%xmm0,%xmm3
   1339 	vpxor	%xmm3,%xmm4,%xmm4
   1340 	vpslldq	$8,%xmm4,%xmm3
   1341 	vpsrldq	$8,%xmm4,%xmm4
   1342 	vpxor	%xmm3,%xmm0,%xmm0
   1343 	vpxor	%xmm4,%xmm1,%xmm1
   1344 
   1345 	vpsrlq	$1,%xmm0,%xmm4
   1346 	vpxor	%xmm0,%xmm1,%xmm1
   1347 	vpxor	%xmm4,%xmm0,%xmm0
   1348 	vpsrlq	$5,%xmm4,%xmm4
   1349 	vpxor	%xmm4,%xmm0,%xmm0
   1350 	vpsrlq	$1,%xmm0,%xmm0
   1351 	vpxor	%xmm1,%xmm0,%xmm0
   1352 	vpshufd	$78,%xmm5,%xmm3
   1353 	vpshufd	$78,%xmm0,%xmm4
   1354 	vpxor	%xmm5,%xmm3,%xmm3
   1355 	vmovdqu	%xmm5,0(%rdi)
   1356 	vpxor	%xmm0,%xmm4,%xmm4
   1357 	vmovdqu	%xmm0,16(%rdi)
   1358 	leaq	48(%rdi),%rdi
   1359 	subq	$1,%r10
   1360 	jnz	L$init_loop_avx
   1361 
   1362 	vpalignr	$8,%xmm4,%xmm3,%xmm5
   1363 	vmovdqu	%xmm5,-16(%rdi)
   1364 
   1365 	vzeroupper
   1366 	.byte	0xf3,0xc3
   1367 
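/* gcm_gmult_avx simply tail-calls the PCLMULQDQ implementation above,
   presumably because a single multiplication gains nothing from the wider
   AVX code path. */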
   1368 .globl	_gcm_gmult_avx
   1369 .private_extern _gcm_gmult_avx
   1370 
   1371 .p2align	5
   1372 _gcm_gmult_avx:
   1373 	jmp	L$_gmult_clmul
   1374 
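/*
 * gcm_ghash_avx: bulk GHASH using VEX-encoded carry-less multiplies; inputs
 * are as in gcm_ghash_clmul.  Whole 128-byte chunks go through L$oop8x_avx
 * (eight blocks per iteration against H^8..H^1); the remainder is handled a
 * block at a time via L$short_avx / L$tail_avx.
 */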
   1375 .globl	_gcm_ghash_avx
   1376 .private_extern _gcm_ghash_avx
   1377 
   1378 .p2align	5
   1379 _gcm_ghash_avx:
   1380 	vzeroupper
   1381 
   1382 	vmovdqu	(%rdi),%xmm10
   1383 	leaq	L$0x1c2_polynomial(%rip),%r10
   1384 	leaq	64(%rsi),%rsi
   1385 	vmovdqu	L$bswap_mask(%rip),%xmm13
   1386 	vpshufb	%xmm13,%xmm10,%xmm10
   1387 	cmpq	$0x80,%rcx
   1388 	jb	L$short_avx
   1389 	subq	$0x80,%rcx
   1390 
   1391 	vmovdqu	112(%rdx),%xmm14
   1392 	vmovdqu	0-64(%rsi),%xmm6
   1393 	vpshufb	%xmm13,%xmm14,%xmm14
   1394 	vmovdqu	32-64(%rsi),%xmm7
   1395 
   1396 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1397 	vmovdqu	96(%rdx),%xmm15
   1398 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1399 	vpxor	%xmm14,%xmm9,%xmm9
   1400 	vpshufb	%xmm13,%xmm15,%xmm15
   1401 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1402 	vmovdqu	16-64(%rsi),%xmm6
   1403 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1404 	vmovdqu	80(%rdx),%xmm14
   1405 	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
   1406 	vpxor	%xmm15,%xmm8,%xmm8
   1407 
   1408 	vpshufb	%xmm13,%xmm14,%xmm14
   1409 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
   1410 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1411 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
   1412 	vmovdqu	48-64(%rsi),%xmm6
   1413 	vpxor	%xmm14,%xmm9,%xmm9
   1414 	vmovdqu	64(%rdx),%xmm15
   1415 	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
   1416 	vmovdqu	80-64(%rsi),%xmm7
   1417 
   1418 	vpshufb	%xmm13,%xmm15,%xmm15
   1419 	vpxor	%xmm0,%xmm3,%xmm3
   1420 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1421 	vpxor	%xmm1,%xmm4,%xmm4
   1422 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1423 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1424 	vmovdqu	64-64(%rsi),%xmm6
   1425 	vpxor	%xmm2,%xmm5,%xmm5
   1426 	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
   1427 	vpxor	%xmm15,%xmm8,%xmm8
   1428 
   1429 	vmovdqu	48(%rdx),%xmm14
   1430 	vpxor	%xmm3,%xmm0,%xmm0
   1431 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
   1432 	vpxor	%xmm4,%xmm1,%xmm1
   1433 	vpshufb	%xmm13,%xmm14,%xmm14
   1434 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
   1435 	vmovdqu	96-64(%rsi),%xmm6
   1436 	vpxor	%xmm5,%xmm2,%xmm2
   1437 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1438 	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
   1439 	vmovdqu	128-64(%rsi),%xmm7
   1440 	vpxor	%xmm14,%xmm9,%xmm9
   1441 
   1442 	vmovdqu	32(%rdx),%xmm15
   1443 	vpxor	%xmm0,%xmm3,%xmm3
   1444 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1445 	vpxor	%xmm1,%xmm4,%xmm4
   1446 	vpshufb	%xmm13,%xmm15,%xmm15
   1447 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1448 	vmovdqu	112-64(%rsi),%xmm6
   1449 	vpxor	%xmm2,%xmm5,%xmm5
   1450 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1451 	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
   1452 	vpxor	%xmm15,%xmm8,%xmm8
   1453 
   1454 	vmovdqu	16(%rdx),%xmm14
   1455 	vpxor	%xmm3,%xmm0,%xmm0
   1456 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
   1457 	vpxor	%xmm4,%xmm1,%xmm1
   1458 	vpshufb	%xmm13,%xmm14,%xmm14
   1459 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
   1460 	vmovdqu	144-64(%rsi),%xmm6
   1461 	vpxor	%xmm5,%xmm2,%xmm2
   1462 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1463 	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
   1464 	vmovdqu	176-64(%rsi),%xmm7
   1465 	vpxor	%xmm14,%xmm9,%xmm9
   1466 
   1467 	vmovdqu	(%rdx),%xmm15
   1468 	vpxor	%xmm0,%xmm3,%xmm3
   1469 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1470 	vpxor	%xmm1,%xmm4,%xmm4
   1471 	vpshufb	%xmm13,%xmm15,%xmm15
   1472 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1473 	vmovdqu	160-64(%rsi),%xmm6
   1474 	vpxor	%xmm2,%xmm5,%xmm5
   1475 	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
   1476 
   1477 	leaq	128(%rdx),%rdx
   1478 	cmpq	$0x80,%rcx
   1479 	jb	L$tail_avx
   1480 
   1481 	vpxor	%xmm10,%xmm15,%xmm15
   1482 	subq	$0x80,%rcx
   1483 	jmp	L$oop8x_avx
   1484 
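/* Eight-block loop: low/high/middle partial products are accumulated in
   %xmm10/%xmm11/%xmm12 while the next eight blocks are loaded and
   byte-swapped, and the reduction is performed with vpclmulqdq against the
   L$0x1c2_polynomial constant (addressed via %r10). */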
   1485 .p2align	5
   1486 L$oop8x_avx:
   1487 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1488 	vmovdqu	112(%rdx),%xmm14
   1489 	vpxor	%xmm0,%xmm3,%xmm3
   1490 	vpxor	%xmm15,%xmm8,%xmm8
   1491 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm10
   1492 	vpshufb	%xmm13,%xmm14,%xmm14
   1493 	vpxor	%xmm1,%xmm4,%xmm4
   1494 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm11
   1495 	vmovdqu	0-64(%rsi),%xmm6
   1496 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1497 	vpxor	%xmm2,%xmm5,%xmm5
   1498 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm12
   1499 	vmovdqu	32-64(%rsi),%xmm7
   1500 	vpxor	%xmm14,%xmm9,%xmm9
   1501 
   1502 	vmovdqu	96(%rdx),%xmm15
   1503 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1504 	vpxor	%xmm3,%xmm10,%xmm10
   1505 	vpshufb	%xmm13,%xmm15,%xmm15
   1506 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1507 	vxorps	%xmm4,%xmm11,%xmm11
   1508 	vmovdqu	16-64(%rsi),%xmm6
   1509 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1510 	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
   1511 	vpxor	%xmm5,%xmm12,%xmm12
   1512 	vxorps	%xmm15,%xmm8,%xmm8
   1513 
   1514 	vmovdqu	80(%rdx),%xmm14
   1515 	vpxor	%xmm10,%xmm12,%xmm12
   1516 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
   1517 	vpxor	%xmm11,%xmm12,%xmm12
   1518 	vpslldq	$8,%xmm12,%xmm9
   1519 	vpxor	%xmm0,%xmm3,%xmm3
   1520 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
   1521 	vpsrldq	$8,%xmm12,%xmm12
   1522 	vpxor	%xmm9,%xmm10,%xmm10
   1523 	vmovdqu	48-64(%rsi),%xmm6
   1524 	vpshufb	%xmm13,%xmm14,%xmm14
   1525 	vxorps	%xmm12,%xmm11,%xmm11
   1526 	vpxor	%xmm1,%xmm4,%xmm4
   1527 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1528 	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
   1529 	vmovdqu	80-64(%rsi),%xmm7
   1530 	vpxor	%xmm14,%xmm9,%xmm9
   1531 	vpxor	%xmm2,%xmm5,%xmm5
   1532 
   1533 	vmovdqu	64(%rdx),%xmm15
   1534 	vpalignr	$8,%xmm10,%xmm10,%xmm12
   1535 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1536 	vpshufb	%xmm13,%xmm15,%xmm15
   1537 	vpxor	%xmm3,%xmm0,%xmm0
   1538 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1539 	vmovdqu	64-64(%rsi),%xmm6
   1540 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1541 	vpxor	%xmm4,%xmm1,%xmm1
   1542 	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
   1543 	vxorps	%xmm15,%xmm8,%xmm8
   1544 	vpxor	%xmm5,%xmm2,%xmm2
   1545 
   1546 	vmovdqu	48(%rdx),%xmm14
   1547 	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
   1548 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
   1549 	vpshufb	%xmm13,%xmm14,%xmm14
   1550 	vpxor	%xmm0,%xmm3,%xmm3
   1551 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
   1552 	vmovdqu	96-64(%rsi),%xmm6
   1553 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1554 	vpxor	%xmm1,%xmm4,%xmm4
   1555 	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
   1556 	vmovdqu	128-64(%rsi),%xmm7
   1557 	vpxor	%xmm14,%xmm9,%xmm9
   1558 	vpxor	%xmm2,%xmm5,%xmm5
   1559 
   1560 	vmovdqu	32(%rdx),%xmm15
   1561 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1562 	vpshufb	%xmm13,%xmm15,%xmm15
   1563 	vpxor	%xmm3,%xmm0,%xmm0
   1564 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1565 	vmovdqu	112-64(%rsi),%xmm6
   1566 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1567 	vpxor	%xmm4,%xmm1,%xmm1
   1568 	vpclmulqdq	$0x00,%xmm7,%xmm9,%xmm2
   1569 	vpxor	%xmm15,%xmm8,%xmm8
   1570 	vpxor	%xmm5,%xmm2,%xmm2
   1571 	vxorps	%xmm12,%xmm10,%xmm10
   1572 
   1573 	vmovdqu	16(%rdx),%xmm14
   1574 	vpalignr	$8,%xmm10,%xmm10,%xmm12
   1575 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm3
   1576 	vpshufb	%xmm13,%xmm14,%xmm14
   1577 	vpxor	%xmm0,%xmm3,%xmm3
   1578 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm4
   1579 	vmovdqu	144-64(%rsi),%xmm6
   1580 	vpclmulqdq	$0x10,(%r10),%xmm10,%xmm10
   1581 	vxorps	%xmm11,%xmm12,%xmm12
   1582 	vpunpckhqdq	%xmm14,%xmm14,%xmm9
   1583 	vpxor	%xmm1,%xmm4,%xmm4
   1584 	vpclmulqdq	$0x10,%xmm7,%xmm8,%xmm5
   1585 	vmovdqu	176-64(%rsi),%xmm7
   1586 	vpxor	%xmm14,%xmm9,%xmm9
   1587 	vpxor	%xmm2,%xmm5,%xmm5
   1588 
   1589 	vmovdqu	(%rdx),%xmm15
   1590 	vpclmulqdq	$0x00,%xmm6,%xmm14,%xmm0
   1591 	vpshufb	%xmm13,%xmm15,%xmm15
   1592 	vpclmulqdq	$0x11,%xmm6,%xmm14,%xmm1
   1593 	vmovdqu	160-64(%rsi),%xmm6
   1594 	vpxor	%xmm12,%xmm15,%xmm15
   1595 	vpclmulqdq	$0x10,%xmm7,%xmm9,%xmm2
   1596 	vpxor	%xmm10,%xmm15,%xmm15
   1597 
   1598 	leaq	128(%rdx),%rdx
   1599 	subq	$0x80,%rcx
   1600 	jnc	L$oop8x_avx
   1601 
   1602 	addq	$0x80,%rcx
   1603 	jmp	L$tail_no_xor_avx
   1604 
   1605 .p2align	5
   1606 L$short_avx:
   1607 	vmovdqu	-16(%rdx,%rcx,1),%xmm14
   1608 	leaq	(%rdx,%rcx,1),%rdx
   1609 	vmovdqu	0-64(%rsi),%xmm6
   1610 	vmovdqu	32-64(%rsi),%xmm7
   1611 	vpshufb	%xmm13,%xmm14,%xmm15
   1612 
   1613 	vmovdqa	%xmm0,%xmm3
   1614 	vmovdqa	%xmm1,%xmm4
   1615 	vmovdqa	%xmm2,%xmm5
   1616 	subq	$0x10,%rcx
   1617 	jz	L$tail_avx
   1618 
   1619 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1620 	vpxor	%xmm0,%xmm3,%xmm3
   1621 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1622 	vpxor	%xmm15,%xmm8,%xmm8
   1623 	vmovdqu	-32(%rdx),%xmm14
   1624 	vpxor	%xmm1,%xmm4,%xmm4
   1625 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1626 	vmovdqu	16-64(%rsi),%xmm6
   1627 	vpshufb	%xmm13,%xmm14,%xmm15
   1628 	vpxor	%xmm2,%xmm5,%xmm5
   1629 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1630 	vpsrldq	$8,%xmm7,%xmm7
   1631 	subq	$0x10,%rcx
   1632 	jz	L$tail_avx
   1633 
   1634 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1635 	vpxor	%xmm0,%xmm3,%xmm3
   1636 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1637 	vpxor	%xmm15,%xmm8,%xmm8
   1638 	vmovdqu	-48(%rdx),%xmm14
   1639 	vpxor	%xmm1,%xmm4,%xmm4
   1640 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1641 	vmovdqu	48-64(%rsi),%xmm6
   1642 	vpshufb	%xmm13,%xmm14,%xmm15
   1643 	vpxor	%xmm2,%xmm5,%xmm5
   1644 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1645 	vmovdqu	80-64(%rsi),%xmm7
   1646 	subq	$0x10,%rcx
   1647 	jz	L$tail_avx
   1648 
   1649 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1650 	vpxor	%xmm0,%xmm3,%xmm3
   1651 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1652 	vpxor	%xmm15,%xmm8,%xmm8
   1653 	vmovdqu	-64(%rdx),%xmm14
   1654 	vpxor	%xmm1,%xmm4,%xmm4
   1655 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1656 	vmovdqu	64-64(%rsi),%xmm6
   1657 	vpshufb	%xmm13,%xmm14,%xmm15
   1658 	vpxor	%xmm2,%xmm5,%xmm5
   1659 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1660 	vpsrldq	$8,%xmm7,%xmm7
   1661 	subq	$0x10,%rcx
   1662 	jz	L$tail_avx
   1663 
   1664 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1665 	vpxor	%xmm0,%xmm3,%xmm3
   1666 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1667 	vpxor	%xmm15,%xmm8,%xmm8
   1668 	vmovdqu	-80(%rdx),%xmm14
   1669 	vpxor	%xmm1,%xmm4,%xmm4
   1670 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1671 	vmovdqu	96-64(%rsi),%xmm6
   1672 	vpshufb	%xmm13,%xmm14,%xmm15
   1673 	vpxor	%xmm2,%xmm5,%xmm5
   1674 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1675 	vmovdqu	128-64(%rsi),%xmm7
   1676 	subq	$0x10,%rcx
   1677 	jz	L$tail_avx
   1678 
   1679 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1680 	vpxor	%xmm0,%xmm3,%xmm3
   1681 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1682 	vpxor	%xmm15,%xmm8,%xmm8
   1683 	vmovdqu	-96(%rdx),%xmm14
   1684 	vpxor	%xmm1,%xmm4,%xmm4
   1685 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1686 	vmovdqu	112-64(%rsi),%xmm6
   1687 	vpshufb	%xmm13,%xmm14,%xmm15
   1688 	vpxor	%xmm2,%xmm5,%xmm5
   1689 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1690 	vpsrldq	$8,%xmm7,%xmm7
   1691 	subq	$0x10,%rcx
   1692 	jz	L$tail_avx
   1693 
   1694 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1695 	vpxor	%xmm0,%xmm3,%xmm3
   1696 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1697 	vpxor	%xmm15,%xmm8,%xmm8
   1698 	vmovdqu	-112(%rdx),%xmm14
   1699 	vpxor	%xmm1,%xmm4,%xmm4
   1700 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1701 	vmovdqu	144-64(%rsi),%xmm6
   1702 	vpshufb	%xmm13,%xmm14,%xmm15
   1703 	vpxor	%xmm2,%xmm5,%xmm5
   1704 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1705 	vmovq	184-64(%rsi),%xmm7
   1706 	subq	$0x10,%rcx
   1707 	jmp	L$tail_avx
   1708 
   1709 .p2align	5
   1710 L$tail_avx:
   1711 	vpxor	%xmm10,%xmm15,%xmm15
   1712 L$tail_no_xor_avx:
   1713 	vpunpckhqdq	%xmm15,%xmm15,%xmm8
   1714 	vpxor	%xmm0,%xmm3,%xmm3
   1715 	vpclmulqdq	$0x00,%xmm6,%xmm15,%xmm0
   1716 	vpxor	%xmm15,%xmm8,%xmm8
   1717 	vpxor	%xmm1,%xmm4,%xmm4
   1718 	vpclmulqdq	$0x11,%xmm6,%xmm15,%xmm1
   1719 	vpxor	%xmm2,%xmm5,%xmm5
   1720 	vpclmulqdq	$0x00,%xmm7,%xmm8,%xmm2
   1721 
   1722 	vmovdqu	(%r10),%xmm12
   1723 
   1724 	vpxor	%xmm0,%xmm3,%xmm10
   1725 	vpxor	%xmm1,%xmm4,%xmm11
   1726 	vpxor	%xmm2,%xmm5,%xmm5
   1727 
   1728 	vpxor	%xmm10,%xmm5,%xmm5
   1729 	vpxor	%xmm11,%xmm5,%xmm5
   1730 	vpslldq	$8,%xmm5,%xmm9
   1731 	vpsrldq	$8,%xmm5,%xmm5
   1732 	vpxor	%xmm9,%xmm10,%xmm10
   1733 	vpxor	%xmm5,%xmm11,%xmm11
   1734 
   1735 	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
   1736 	vpalignr	$8,%xmm10,%xmm10,%xmm10
   1737 	vpxor	%xmm9,%xmm10,%xmm10
   1738 
   1739 	vpclmulqdq	$0x10,%xmm12,%xmm10,%xmm9
   1740 	vpalignr	$8,%xmm10,%xmm10,%xmm10
   1741 	vpxor	%xmm11,%xmm10,%xmm10
   1742 	vpxor	%xmm9,%xmm10,%xmm10
   1743 
   1744 	cmpq	$0,%rcx
   1745 	jne	L$short_avx
   1746 
   1747 	vpshufb	%xmm13,%xmm10,%xmm10
   1748 	vmovdqu	%xmm10,(%rdi)
   1749 	vzeroupper
   1750 	.byte	0xf3,0xc3
   1751 
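/*
 * Constant pool:
 *   L$bswap_mask            - pshufb mask that byte-reverses a 128-bit value.
 *   L$0x1c2_polynomial      - reduction constant for the GHASH polynomial.
 *   L$7_mask, L$7_mask_poly - helper masks for the CLMUL reduction (only
 *                             L$7_mask appears to be referenced above).
 *   L$rem_4bit, L$rem_8bit  - reduction tables for the 4-bit and 8-bit
 *                             table-driven implementations.
 */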
   1752 .p2align	6
   1753 L$bswap_mask:
   1754 .byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
   1755 L$0x1c2_polynomial:
   1756 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
   1757 L$7_mask:
   1758 .long	7,0,7,0
   1759 L$7_mask_poly:
   1760 .long	7,0,450,0
   1761 .p2align	6
   1762 
   1763 L$rem_4bit:
   1764 .long	0,0,0,471859200,0,943718400,0,610271232
   1765 .long	0,1887436800,0,1822425088,0,1220542464,0,1423966208
   1766 .long	0,3774873600,0,4246732800,0,3644850176,0,3311403008
   1767 .long	0,2441084928,0,2376073216,0,2847932416,0,3051356160
   1768 
   1769 L$rem_8bit:
   1770 .value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
   1771 .value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
   1772 .value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
   1773 .value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
   1774 .value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
   1775 .value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
   1776 .value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
   1777 .value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
   1778 .value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
   1779 .value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
   1780 .value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
   1781 .value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
   1782 .value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
   1783 .value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
   1784 .value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
   1785 .value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
   1786 .value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
   1787 .value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
   1788 .value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
   1789 .value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
   1790 .value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
   1791 .value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
   1792 .value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
   1793 .value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
   1794 .value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
   1795 .value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
   1796 .value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
   1797 .value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
   1798 .value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
   1799 .value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
   1800 .value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
   1801 .value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
   1802 
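/* The byte string below is the ASCII CRYPTOGAMS attribution,
   "GHASH for x86_64, CRYPTOGAMS by <appro@openssl.org>". */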
   1803 .byte	71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
   1804 .p2align	6
   1805 #endif
   1806