      1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001
      2 From: Adam Langley <agl@chromium.org>
      3 Date: Tue, 5 Nov 2013 13:10:11 -0500
      4 Subject: [PATCH 52/52] Optional NEON support on ARM.
      5 
      6 This patch causes ARM to build both the NEON and generic versions of
      7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by
      8 calling CRYPTO_set_NEON_capable(1).
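
        As an illustrative sketch (not part of this change), a Linux caller
        could switch the NEON paths on before making any other crypto calls,
        assuming getauxval() is available (glibc 2.16+ / Android API 18+):

            #include <sys/auxv.h>        /* getauxval(), AT_HWCAP */
            #include <asm/hwcap.h>       /* HWCAP_NEON (ARM Linux only) */
            #include <openssl/crypto.h>

            static void enable_neon_if_supported(void)
                {
                /* The flag defaults to off, so builds that never call this
                 * keep using the generic C implementations. */
                if (getauxval(AT_HWCAP) & HWCAP_NEON)
                    CRYPTO_set_NEON_capable(1);
                }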
      9 ---
     10  .gitignore                     |   1 +
     11  Configure                      |   2 +-
     12  apps/speed.c                   |   5 +
     13  crypto/chacha/chacha_enc.c     |  18 +
     14  crypto/chacha/chacha_vec.c     |   7 +
     15  crypto/chacha/chacha_vec_arm.S | 863 +++++++++++++++++++++++++++++++++++++++++
     16  crypto/cryptlib.c              |  14 +
     17  crypto/crypto.h                |   8 +
     18  crypto/poly1305/poly1305.c     |  35 ++
     19  crypto/poly1305/poly1305_arm.c |   9 +-
     20  10 files changed, 958 insertions(+), 4 deletions(-)
     21  create mode 100644 crypto/chacha/chacha_vec_arm.S
     22 
     23 diff --git a/Configure b/Configure
     24 index 1b95384..18b7af0 100755
     25 --- a/Configure
     26 +++ b/Configure
     27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a
     28  my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::::";
     29  my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::::";
     30  my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::::ghash-s390x.o:";
     31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void";
     32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm.o:void";
     33  my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::32";
     34  my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::64";
     35  my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::";
     36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c
     37 index 54d1ca3..e4b648f 100644
     38 --- a/crypto/chacha/chacha_enc.c
     39 +++ b/crypto/chacha/chacha_enc.c
     40 @@ -61,6 +61,7 @@
     41  
     42  #if !defined(OPENSSL_NO_CHACHA)
     43  
     44 +#include <openssl/crypto.h>
     45  #include <openssl/chacha.h>
     46  
     47  /* sigma contains the ChaCha constants, which happen to be an ASCII string. */
     48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k";
     49  
     50  typedef unsigned int uint32_t;
     51  
     52 +#if __arm__
     53 +/* Defined in chacha_vec.c */
     54 +void CRYPTO_chacha_20_neon(unsigned char *out,
     55 +		           const unsigned char *in, size_t in_len,
     56 +		           const unsigned char key[32],
     57 +		           const unsigned char nonce[8],
     58 +		           size_t counter);
     59 +#endif
     60 +
     61  /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in
     62   * |input| and writes the 64 output bytes to |output|. */
     63  static void chacha_core(unsigned char output[64], const uint32_t input[16],
     64 @@ -124,6 +134,16 @@ void CRYPTO_chacha_20(unsigned char *out,
     65  	unsigned char buf[64];
     66  	size_t todo, i;
     67  
     68 +#if __arm__
     69 +	if (CRYPTO_is_NEON_capable() &&
     70 +	    ((intptr_t)in & 15) == 0 &&
     71 +	    ((intptr_t)out & 15) == 0)
     72 +		{
     73 +		CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter);
     74 +		return;
     75 +		}
     76 +#endif
     77 +
     78  	input[0] = U8TO32_LITTLE(sigma + 0);
     79  	input[1] = U8TO32_LITTLE(sigma + 4);
     80  	input[2] = U8TO32_LITTLE(sigma + 8);
     81 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c
     82 index 33b2238..1226c39 100644
     83 --- a/crypto/chacha/chacha_vec.c
     84 +++ b/crypto/chacha/chacha_vec.c
     85 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16)));
     86  	STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2));      \
     87  	STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3));
     88  
     89 +#if __ARM_NEON__
     90 +/* On ARM we cannot assume NEON support at build time, so this function is
     91 + * built under a different name alongside the generic code; the NEON version
     92 + * is selected at run-time. */
     93 +void CRYPTO_chacha_20_neon(
     94 +#else
     95  void CRYPTO_chacha_20(
     96 +#endif
     97  	unsigned char *out,
     98  	const unsigned char *in,
     99  	size_t inlen,
    100 diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S
    101 new file mode 100644
    102 index 0000000..24a5050
    103 --- /dev/null
    104 +++ b/crypto/chacha/chacha_vec_arm.S
    105 @@ -0,0 +1,863 @@
    106 +# This file contains a pre-compiled version of chacha_vec.c for ARM. This is
    107 +# needed to support switching on NEON code at runtime. If the whole of OpenSSL
    108 +# were to be compiled with the needed flags to build chacha_vec.c, then it
    109 +# wouldn't be possible to run on non-NEON systems.
    110 +#
    111 +# This file was generated by:
    112 +#
    113 +#     /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-linux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../include -fpic -o chacha_vec_arm.S
    114 +#
    115 +# And then EABI attribute 28 was set to zero to allow linking with soft-float
    116 +# code.
    117 +
    118 +	.syntax unified
    119 +	.cpu cortex-a8
    120 +	.eabi_attribute 27, 3
    121 +	.eabi_attribute 28, 0
    122 +	.fpu neon
    123 +	.eabi_attribute 20, 1
    124 +	.eabi_attribute 21, 1
    125 +	.eabi_attribute 23, 3
    126 +	.eabi_attribute 24, 1
    127 +	.eabi_attribute 25, 1
    128 +	.eabi_attribute 26, 2
    129 +	.eabi_attribute 30, 2
    130 +	.eabi_attribute 34, 1
    131 +	.eabi_attribute 18, 4
    132 +	.thumb
    133 +	.file	"chacha_vec.c"
    134 +	.text
    135 +	.align	2
    136 +	.global	CRYPTO_chacha_20_neon
    137 +	.thumb
    138 +	.thumb_func
    139 +	.type	CRYPTO_chacha_20_neon, %function
    140 +CRYPTO_chacha_20_neon:
    141 +	@ args = 8, pretend = 0, frame = 304
    142 +	@ frame_needed = 1, uses_anonymous_args = 0
    143 +	@ link register save eliminated.
    144 +	push	{r4, r5, r6, r7, r8, r9, sl, fp}
    145 +	fstmfdd	sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
    146 +	sub	sp, sp, #304
    147 +	add	r7, sp, #0
    148 +	movw	ip, #43691
    149 +	movt	ip, 43690
    150 +	str	r2, [r7, #196]
    151 +	sub	sp, sp, #96
    152 +	ldr	r4, [r7, #196]
    153 +	ldr	r6, [r7, #400]
    154 +	ldr	r2, .L38+16
    155 +	umull	r4, ip, ip, r4
    156 +	ldr	r6, [r6, #0]
    157 +	ldr	r8, [r7, #400]
    158 +.LPIC24:
    159 +	add	r2, pc
    160 +	add	r4, sp, #15
    161 +	str	r3, [r7, #244]
    162 +	str	r6, [r7, #176]
    163 +	bic	r4, r4, #15
    164 +	str	r0, [r7, #188]
    165 +	str	r4, [r7, #200]
    166 +	lsrs	ip, ip, #7
    167 +	str	r1, [r7, #184]
    168 +	ldmia	r2, {r0, r1, r2, r3}
    169 +	ldr	r4, [r8, #4]
    170 +	ldr	r5, [r7, #244]
    171 +	vld1.64	{d24-d25}, [r5:64]
    172 +	vldr	d26, [r5, #16]
    173 +	vldr	d27, [r5, #24]
    174 +	ldr	r9, [r7, #200]
    175 +	ldr	r8, [r7, #404]
    176 +	ldr	r5, [r7, #176]
    177 +	add	r6, r9, #64
    178 +	str	r4, [r7, #300]
    179 +	mov	r4, #0
    180 +	str	r8, [r7, #288]
    181 +	str	r5, [r7, #296]
    182 +	str	r4, [r7, #292]
    183 +	stmia	r6, {r0, r1, r2, r3}
    184 +	vldr	d22, [r9, #64]
    185 +	vldr	d23, [r9, #72]
    186 +	vldr	d20, [r7, #288]
    187 +	vldr	d21, [r7, #296]
    188 +	str	ip, [r7, #192]
    189 +	beq	.L20
    190 +	lsl	r6, ip, #1
    191 +	ldr	r1, [r9, #68]
    192 +	add	r3, r6, ip
    193 +	str	r6, [r7, #180]
    194 +	ldr	r2, [r9, #72]
    195 +	add	r8, r8, #2
    196 +	ldr	r5, [r9, #76]
    197 +	vldr	d18, .L38
    198 +	vldr	d19, .L38+8
    199 +	str	r4, [r7, #240]
    200 +	ldr	r6, [r7, #184]
    201 +	ldr	r4, [r7, #188]
    202 +	str	r0, [r7, #224]
    203 +	str	r1, [r7, #220]
    204 +	str	r8, [r7, #208]
    205 +	str	r2, [r7, #216]
    206 +	str	r3, [r7, #204]
    207 +	str	r5, [r7, #212]
    208 +	str	r6, [r7, #252]
    209 +	str	r4, [r7, #248]
    210 +.L4:
    211 +	ldr	r2, [r7, #244]
    212 +	add	r9, r7, #216
    213 +	ldr	r3, [r7, #244]
    214 +	vadd.i32	q8, q10, q9
    215 +	ldr	r6, [r7, #208]
    216 +	vmov	q15, q13  @ v4si
    217 +	ldr	r5, [r7, #240]
    218 +	vmov	q3, q12  @ v4si
    219 +	ldr	r4, [r7, #244]
    220 +	vmov	q2, q11  @ v4si
    221 +	adds	r5, r5, r6
    222 +	ldr	r2, [r2, #8]
    223 +	ldr	r6, [r7, #400]
    224 +	vmov	q5, q10  @ v4si
    225 +	ldr	r3, [r3, #12]
    226 +	vmov	q1, q13  @ v4si
    227 +	ldr	r0, [r7, #244]
    228 +	vmov	q0, q12  @ v4si
    229 +	ldr	r1, [r7, #244]
    230 +	vmov	q4, q11  @ v4si
    231 +	ldmia	r9, {r9, sl, fp}
    232 +	str	r5, [r7, #228]
    233 +	ldr	r5, [r4, #24]
    234 +	ldr	r0, [r0, #0]
    235 +	ldr	r1, [r1, #4]
    236 +	str	r2, [r7, #264]
    237 +	str	r3, [r7, #236]
    238 +	ldr	r2, [r6, #4]
    239 +	ldr	r3, [r4, #28]
    240 +	str	r5, [r7, #280]
    241 +	ldr	r5, [r6, #0]
    242 +	movs	r6, #0
    243 +	ldr	ip, [r7, #228]
    244 +	ldr	r8, [r7, #212]
    245 +	str	r0, [r7, #232]
    246 +	str	r1, [r7, #268]
    247 +	ldr	r0, [r4, #16]
    248 +	ldr	r1, [r4, #20]
    249 +	movs	r4, #10
    250 +	str	r2, [r7, #24]
    251 +	str	r3, [r7, #284]
    252 +	str	r4, [r7, #256]
    253 +	ldr	r2, [r7, #264]
    254 +	str	r9, [r7, #276]
    255 +	mov	r9, r6
    256 +	ldr	r6, [r7, #280]
    257 +	str	r8, [r7, #260]
    258 +	mov	r8, sl
    259 +	str	r1, [r7, #272]
    260 +	mov	sl, ip
    261 +	str	r6, [r7, #264]
    262 +	mov	r6, r5
    263 +	ldr	r3, [r7, #236]
    264 +	mov	r5, r0
    265 +	ldr	ip, [r7, #24]
    266 +	ldr	r1, [r7, #268]
    267 +	ldr	r0, [r7, #232]
    268 +	b	.L39
    269 +.L40:
    270 +	.align	3
    271 +.L38:
    272 +	.word	1
    273 +	.word	0
    274 +	.word	0
    275 +	.word	0
    276 +	.word	.LANCHOR0-(.LPIC24+4)
    277 +.L39:
    278 +.L3:
    279 +	vadd.i32	q4, q4, q0
    280 +	add	r8, r8, r1
    281 +	vadd.i32	q2, q2, q3
    282 +	str	r8, [r7, #268]
    283 +	veor	q5, q5, q4
    284 +	ldr	r8, [r7, #276]
    285 +	veor	q8, q8, q2
    286 +	add	fp, fp, r0
    287 +	str	fp, [r7, #280]
    288 +	add	r8, r8, r2
    289 +	vrev32.16	q5, q5
    290 +	str	r8, [r7, #276]
    291 +	vrev32.16	q8, q8
    292 +	vadd.i32	q1, q1, q5
    293 +	vadd.i32	q15, q15, q8
    294 +	ldr	r8, [r7, #280]
    295 +	veor	q0, q1, q0
    296 +	ldr	r4, [r7, #260]
    297 +	veor	q3, q15, q3
    298 +	eor	sl, sl, r8
    299 +	ldr	r8, [r7, #276]
    300 +	add	fp, r4, r3
    301 +	vshl.i32	q7, q0, #12
    302 +	ldr	r4, [r7, #268]
    303 +	vshl.i32	q6, q3, #12
    304 +	eor	r6, r6, r8
    305 +	eor	r9, r9, r4
    306 +	ldr	r4, [r7, #272]
    307 +	vsri.32	q7, q0, #20
    308 +	ror	r8, r6, #16
    309 +	ldr	r6, [r7, #264]
    310 +	eor	ip, ip, fp
    311 +	vsri.32	q6, q3, #20
    312 +	ror	sl, sl, #16
    313 +	ror	r9, r9, #16
    314 +	add	r5, r5, sl
    315 +	vadd.i32	q4, q4, q7
    316 +	str	r5, [r7, #236]
    317 +	vadd.i32	q2, q2, q6
    318 +	add	r5, r4, r9
    319 +	add	r4, r6, r8
    320 +	ldr	r6, [r7, #284]
    321 +	ror	ip, ip, #16
    322 +	veor	q5, q4, q5
    323 +	veor	q8, q2, q8
    324 +	add	r6, r6, ip
    325 +	str	r6, [r7, #264]
    326 +	eors	r1, r1, r5
    327 +	ldr	r6, [r7, #236]
    328 +	vshl.i32	q3, q5, #8
    329 +	vshl.i32	q14, q8, #8
    330 +	eors	r2, r2, r4
    331 +	eors	r0, r0, r6
    332 +	ldr	r6, [r7, #264]
    333 +	vsri.32	q3, q5, #24
    334 +	ror	r1, r1, #20
    335 +	eors	r3, r3, r6
    336 +	ldr	r6, [r7, #280]
    337 +	ror	r0, r0, #20
    338 +	vsri.32	q14, q8, #24
    339 +	adds	r6, r0, r6
    340 +	str	r6, [r7, #284]
    341 +	ldr	r6, [r7, #268]
    342 +	vadd.i32	q1, q1, q3
    343 +	vadd.i32	q15, q15, q14
    344 +	ror	r2, r2, #20
    345 +	adds	r6, r1, r6
    346 +	str	r6, [r7, #260]
    347 +	ldr	r6, [r7, #276]
    348 +	veor	q6, q15, q6
    349 +	veor	q7, q1, q7
    350 +	ror	r3, r3, #20
    351 +	adds	r6, r2, r6
    352 +	str	r6, [r7, #280]
    353 +	ldr	r6, [r7, #284]
    354 +	vshl.i32	q0, q6, #7
    355 +	vshl.i32	q5, q7, #7
    356 +	add	fp, r3, fp
    357 +	eor	sl, r6, sl
    358 +	ldr	r6, [r7, #260]
    359 +	eor	ip, fp, ip
    360 +	vsri.32	q0, q6, #25
    361 +	eor	r9, r6, r9
    362 +	ldr	r6, [r7, #280]
    363 +	ror	sl, sl, #24
    364 +	vsri.32	q5, q7, #25
    365 +	eor	r8, r6, r8
    366 +	ldr	r6, [r7, #236]
    367 +	ror	r9, r9, #24
    368 +	ror	ip, ip, #24
    369 +	add	r6, sl, r6
    370 +	str	r6, [r7, #276]
    371 +	ldr	r6, [r7, #264]
    372 +	add	r5, r9, r5
    373 +	str	r5, [r7, #272]
    374 +	vext.32	q5, q5, q5, #1
    375 +	add	r5, ip, r6
    376 +	ldr	r6, [r7, #276]
    377 +	vext.32	q0, q0, q0, #1
    378 +	vadd.i32	q4, q4, q5
    379 +	eors	r0, r0, r6
    380 +	ldr	r6, [r7, #272]
    381 +	vadd.i32	q2, q2, q0
    382 +	vext.32	q3, q3, q3, #3
    383 +	ror	r8, r8, #24
    384 +	eors	r1, r1, r6
    385 +	vext.32	q14, q14, q14, #3
    386 +	add	r4, r8, r4
    387 +	ldr	r6, [r7, #284]
    388 +	veor	q3, q4, q3
    389 +	veor	q14, q2, q14
    390 +	eors	r2, r2, r4
    391 +	ror	r1, r1, #25
    392 +	vext.32	q1, q1, q1, #2
    393 +	adds	r6, r1, r6
    394 +	str	r6, [r7, #284]
    395 +	vext.32	q15, q15, q15, #2
    396 +	ldr	r6, [r7, #260]
    397 +	eors	r3, r3, r5
    398 +	ror	r2, r2, #25
    399 +	vrev32.16	q8, q14
    400 +	adds	r6, r2, r6
    401 +	vrev32.16	q3, q3
    402 +	str	r6, [r7, #268]
    403 +	vadd.i32	q1, q1, q3
    404 +	ldr	r6, [r7, #280]
    405 +	vadd.i32	q15, q15, q8
    406 +	ror	r3, r3, #25
    407 +	veor	q5, q1, q5
    408 +	adds	r6, r3, r6
    409 +	veor	q0, q15, q0
    410 +	str	r6, [r7, #264]
    411 +	ldr	r6, [r7, #268]
    412 +	ror	r0, r0, #25
    413 +	add	fp, r0, fp
    414 +	vshl.i32	q6, q5, #12
    415 +	eor	sl, r6, sl
    416 +	ldr	r6, [r7, #284]
    417 +	vshl.i32	q14, q0, #12
    418 +	eor	r8, fp, r8
    419 +	eor	ip, r6, ip
    420 +	ldr	r6, [r7, #264]
    421 +	vsri.32	q6, q5, #20
    422 +	ror	sl, sl, #16
    423 +	eor	r9, r6, r9
    424 +	ror	r6, r8, #16
    425 +	vsri.32	q14, q0, #20
    426 +	ldr	r8, [r7, #272]
    427 +	ror	ip, ip, #16
    428 +	add	r5, sl, r5
    429 +	add	r8, r6, r8
    430 +	add	r4, ip, r4
    431 +	str	r4, [r7, #236]
    432 +	eor	r0, r8, r0
    433 +	str	r5, [r7, #280]
    434 +	vadd.i32	q4, q4, q6
    435 +	ldr	r5, [r7, #236]
    436 +	vadd.i32	q2, q2, q14
    437 +	ldr	r4, [r7, #276]
    438 +	ror	r0, r0, #20
    439 +	veor	q3, q4, q3
    440 +	eors	r1, r1, r5
    441 +	veor	q0, q2, q8
    442 +	str	r8, [r7, #272]
    443 +	str	r0, [r7, #24]
    444 +	add	fp, r0, fp
    445 +	ldr	r8, [r7, #280]
    446 +	ror	r9, r9, #16
    447 +	ldr	r0, [r7, #284]
    448 +	add	r4, r9, r4
    449 +	str	fp, [r7, #260]
    450 +	ror	r1, r1, #20
    451 +	add	fp, r1, r0
    452 +	eor	r2, r8, r2
    453 +	ldr	r0, [r7, #260]
    454 +	eors	r3, r3, r4
    455 +	vshl.i32	q5, q3, #8
    456 +	str	r4, [r7, #232]
    457 +	vshl.i32	q8, q0, #8
    458 +	ldr	r4, [r7, #268]
    459 +	ldr	r5, [r7, #264]
    460 +	ror	r2, r2, #20
    461 +	ror	r3, r3, #20
    462 +	eors	r6, r6, r0
    463 +	adds	r5, r3, r5
    464 +	add	r8, r2, r4
    465 +	vsri.32	q5, q3, #24
    466 +	ldr	r4, [r7, #272]
    467 +	eor	r9, r5, r9
    468 +	eor	ip, fp, ip
    469 +	vsri.32	q8, q0, #24
    470 +	eor	sl, r8, sl
    471 +	ror	r6, r6, #24
    472 +	ldr	r0, [r7, #280]
    473 +	str	r5, [r7, #276]
    474 +	adds	r4, r6, r4
    475 +	ldr	r5, [r7, #236]
    476 +	vadd.i32	q1, q1, q5
    477 +	str	r4, [r7, #272]
    478 +	vadd.i32	q15, q15, q8
    479 +	ldr	r4, [r7, #232]
    480 +	ror	ip, ip, #24
    481 +	ror	sl, sl, #24
    482 +	ror	r9, r9, #24
    483 +	add	r5, ip, r5
    484 +	add	r0, sl, r0
    485 +	str	r5, [r7, #264]
    486 +	add	r5, r9, r4
    487 +	str	r0, [r7, #284]
    488 +	veor	q6, q1, q6
    489 +	ldr	r4, [r7, #24]
    490 +	veor	q14, q15, q14
    491 +	ldr	r0, [r7, #272]
    492 +	eors	r3, r3, r5
    493 +	vshl.i32	q0, q6, #7
    494 +	vext.32	q1, q1, q1, #2
    495 +	eors	r0, r0, r4
    496 +	ldr	r4, [r7, #284]
    497 +	str	r0, [r7, #280]
    498 +	vshl.i32	q3, q14, #7
    499 +	eors	r2, r2, r4
    500 +	ldr	r4, [r7, #280]
    501 +	ldr	r0, [r7, #264]
    502 +	vsri.32	q0, q6, #25
    503 +	ror	r2, r2, #25
    504 +	ror	r3, r3, #25
    505 +	eors	r1, r1, r0
    506 +	vsri.32	q3, q14, #25
    507 +	ror	r0, r4, #25
    508 +	ldr	r4, [r7, #256]
    509 +	ror	r1, r1, #25
    510 +	vext.32	q5, q5, q5, #1
    511 +	subs	r4, r4, #1
    512 +	str	r4, [r7, #256]
    513 +	vext.32	q15, q15, q15, #2
    514 +	vext.32	q8, q8, q8, #1
    515 +	vext.32	q0, q0, q0, #3
    516 +	vext.32	q3, q3, q3, #3
    517 +	bne	.L3
    518 +	ldr	r4, [r7, #264]
    519 +	vadd.i32	q14, q10, q9
    520 +	str	r2, [r7, #264]
    521 +	vadd.i32	q10, q10, q5
    522 +	ldr	r2, [r7, #252]
    523 +	vld1.64	{d12-d13}, [r2:64]
    524 +	ldr	r2, [r7, #220]
    525 +	vadd.i32	q4, q11, q4
    526 +	str	ip, [r7, #24]
    527 +	mov	ip, sl
    528 +	mov	sl, r8
    529 +	ldr	r8, [r7, #260]
    530 +	add	sl, sl, r2
    531 +	ldr	r2, [r7, #212]
    532 +	str	r4, [r7, #280]
    533 +	vadd.i32	q0, q12, q0
    534 +	ldr	r4, [r7, #224]
    535 +	add	r8, r8, r2
    536 +	ldr	r2, [r7, #240]
    537 +	vadd.i32	q1, q13, q1
    538 +	str	r0, [r7, #232]
    539 +	add	fp, fp, r4
    540 +	mov	r0, r5
    541 +	ldr	r4, [r7, #216]
    542 +	mov	r5, r6
    543 +	mov	r6, r9
    544 +	ldr	r9, [r7, #276]
    545 +	adds	r2, r2, #3
    546 +	str	r2, [r7, #240]
    547 +	vadd.i32	q2, q11, q2
    548 +	ldr	r2, [r7, #252]
    549 +	add	r9, r9, r4
    550 +	vadd.i32	q3, q12, q3
    551 +	ldr	r4, [r7, #228]
    552 +	vadd.i32	q15, q13, q15
    553 +	str	r1, [r7, #268]
    554 +	vadd.i32	q8, q14, q8
    555 +	str	r3, [r7, #236]
    556 +	veor	q4, q4, q6
    557 +	ldr	r3, [r7, #284]
    558 +	ldr	r1, [r7, #272]
    559 +	add	ip, r4, ip
    560 +	ldr	r4, [r7, #248]
    561 +	vst1.64	{d8-d9}, [r4:64]
    562 +	vldr	d8, [r2, #16]
    563 +	vldr	d9, [r2, #24]
    564 +	veor	q0, q0, q4
    565 +	vstr	d0, [r4, #16]
    566 +	vstr	d1, [r4, #24]
    567 +	vldr	d0, [r2, #32]
    568 +	vldr	d1, [r2, #40]
    569 +	veor	q1, q1, q0
    570 +	vstr	d2, [r4, #32]
    571 +	vstr	d3, [r4, #40]
    572 +	vldr	d2, [r2, #48]
    573 +	vldr	d3, [r2, #56]
    574 +	veor	q10, q10, q1
    575 +	vstr	d20, [r4, #48]
    576 +	vstr	d21, [r4, #56]
    577 +	vldr	d8, [r2, #64]
    578 +	vldr	d9, [r2, #72]
    579 +	veor	q2, q2, q4
    580 +	vstr	d4, [r4, #64]
    581 +	vstr	d5, [r4, #72]
    582 +	vldr	d10, [r2, #80]
    583 +	vldr	d11, [r2, #88]
    584 +	veor	q3, q3, q5
    585 +	vstr	d6, [r4, #80]
    586 +	vstr	d7, [r4, #88]
    587 +	vldr	d12, [r2, #96]
    588 +	vldr	d13, [r2, #104]
    589 +	veor	q15, q15, q6
    590 +	vstr	d30, [r4, #96]
    591 +	vstr	d31, [r4, #104]
    592 +	vldr	d20, [r2, #112]
    593 +	vldr	d21, [r2, #120]
    594 +	veor	q8, q8, q10
    595 +	vstr	d16, [r4, #112]
    596 +	vstr	d17, [r4, #120]
    597 +	ldr	r4, [r2, #128]
    598 +	ldr	r2, [r7, #248]
    599 +	vadd.i32	q10, q14, q9
    600 +	eor	r4, fp, r4
    601 +	vadd.i32	q10, q10, q9
    602 +	str	r4, [r2, #128]
    603 +	ldr	r4, [r7, #252]
    604 +	ldr	r2, [r4, #132]
    605 +	eor	r2, sl, r2
    606 +	ldr	sl, [r7, #248]
    607 +	str	r2, [sl, #132]
    608 +	ldr	r2, [r4, #136]
    609 +	eor	r2, r9, r2
    610 +	str	r2, [sl, #136]
    611 +	ldr	r2, [r4, #140]
    612 +	eor	r2, r8, r2
    613 +	str	r2, [sl, #140]
    614 +	ldr	r2, [r7, #244]
    615 +	ldr	r4, [r4, #144]
    616 +	ldr	r2, [r2, #0]
    617 +	str	r4, [r7, #44]
    618 +	ldr	r4, [r7, #232]
    619 +	add	r8, r4, r2
    620 +	ldr	r2, [r7, #44]
    621 +	ldr	r4, [r7, #244]
    622 +	eor	r8, r8, r2
    623 +	ldr	r2, [r7, #252]
    624 +	str	r8, [sl, #144]
    625 +	ldr	r4, [r4, #4]
    626 +	ldr	r2, [r2, #148]
    627 +	str	r2, [r7, #40]
    628 +	ldr	r2, [r7, #268]
    629 +	add	r8, r2, r4
    630 +	ldr	r4, [r7, #40]
    631 +	ldr	r2, [r7, #244]
    632 +	eor	r8, r8, r4
    633 +	ldr	r4, [r7, #252]
    634 +	str	r8, [sl, #148]
    635 +	ldr	r2, [r2, #8]
    636 +	ldr	r4, [r4, #152]
    637 +	str	r4, [r7, #36]
    638 +	ldr	r4, [r7, #264]
    639 +	add	r8, r4, r2
    640 +	ldr	r2, [r7, #36]
    641 +	eor	r8, r8, r2
    642 +	str	r8, [sl, #152]
    643 +	ldr	r2, [r7, #252]
    644 +	ldr	r4, [r7, #244]
    645 +	ldr	r2, [r2, #156]
    646 +	ldr	r4, [r4, #12]
    647 +	str	r2, [r7, #32]
    648 +	ldr	r2, [r7, #236]
    649 +	add	r8, r2, r4
    650 +	ldr	r4, [r7, #32]
    651 +	ldr	r2, [r7, #252]
    652 +	eor	r8, r8, r4
    653 +	str	r8, [sl, #156]
    654 +	ldr	r8, [r7, #244]
    655 +	ldr	r2, [r2, #160]
    656 +	ldr	r4, [r8, #16]
    657 +	adds	r0, r0, r4
    658 +	ldr	r4, [r7, #252]
    659 +	eors	r0, r0, r2
    660 +	str	r0, [sl, #160]
    661 +	ldr	r0, [r8, #20]
    662 +	ldr	r2, [r4, #164]
    663 +	adds	r1, r1, r0
    664 +	ldr	r0, [r7, #280]
    665 +	eors	r1, r1, r2
    666 +	str	r1, [sl, #164]
    667 +	ldr	r2, [r8, #24]
    668 +	ldr	r1, [r4, #168]
    669 +	adds	r2, r0, r2
    670 +	eors	r2, r2, r1
    671 +	str	r2, [sl, #168]
    672 +	ldr	r1, [r8, #28]
    673 +	ldr	r2, [r4, #172]
    674 +	adds	r3, r3, r1
    675 +	eors	r3, r3, r2
    676 +	str	r3, [sl, #172]
    677 +	ldr	r3, [r4, #176]
    678 +	eor	r3, ip, r3
    679 +	str	r3, [sl, #176]
    680 +	ldr	r3, [r4, #180]
    681 +	ldr	r4, [r7, #400]
    682 +	eors	r6, r6, r3
    683 +	str	r6, [sl, #180]
    684 +	ldr	r6, [r7, #252]
    685 +	ldr	r2, [r4, #0]
    686 +	ldr	r3, [r6, #184]
    687 +	adds	r5, r5, r2
    688 +	eors	r5, r5, r3
    689 +	str	r5, [sl, #184]
    690 +	ldr	r2, [r6, #188]
    691 +	adds	r6, r6, #192
    692 +	ldr	r3, [r4, #4]
    693 +	str	r6, [r7, #252]
    694 +	ldr	r0, [r7, #24]
    695 +	ldr	r1, [r7, #240]
    696 +	adds	r4, r0, r3
    697 +	eors	r4, r4, r2
    698 +	ldr	r2, [r7, #204]
    699 +	str	r4, [sl, #188]
    700 +	add	sl, sl, #192
    701 +	cmp	r1, r2
    702 +	str	sl, [r7, #248]
    703 +	bne	.L4
    704 +	ldr	r4, [r7, #192]
    705 +	ldr	r3, [r7, #180]
    706 +	ldr	r6, [r7, #188]
    707 +	adds	r5, r3, r4
    708 +	ldr	r8, [r7, #184]
    709 +	lsls	r5, r5, #6
    710 +	adds	r4, r6, r5
    711 +	add	r5, r8, r5
    712 +.L2:
    713 +	ldr	r9, [r7, #196]
    714 +	movw	r3, #43691
    715 +	movt	r3, 43690
    716 +	ldr	sl, [r7, #196]
    717 +	umull	r9, r3, r3, r9
    718 +	lsrs	r3, r3, #7
    719 +	add	r3, r3, r3, lsl #1
    720 +	sub	r3, sl, r3, lsl #6
    721 +	lsrs	r6, r3, #6
    722 +	beq	.L5
    723 +	add	r1, r5, #16
    724 +	add	r2, r4, #16
    725 +	mov	r0, r6
    726 +	vldr	d30, .L41
    727 +	vldr	d31, .L41+8
    728 +.L6:
    729 +	vmov	q8, q10  @ v4si
    730 +	movs	r3, #10
    731 +	vmov	q1, q13  @ v4si
    732 +	vmov	q14, q12  @ v4si
    733 +	vmov	q3, q11  @ v4si
    734 +.L7:
    735 +	vadd.i32	q3, q3, q14
    736 +	subs	r3, r3, #1
    737 +	veor	q2, q8, q3
    738 +	vrev32.16	q2, q2
    739 +	vadd.i32	q8, q1, q2
    740 +	veor	q9, q8, q14
    741 +	vshl.i32	q14, q9, #12
    742 +	vsri.32	q14, q9, #20
    743 +	vadd.i32	q3, q3, q14
    744 +	veor	q2, q3, q2
    745 +	vshl.i32	q9, q2, #8
    746 +	vsri.32	q9, q2, #24
    747 +	vadd.i32	q8, q8, q9
    748 +	vext.32	q9, q9, q9, #3
    749 +	veor	q14, q8, q14
    750 +	vext.32	q1, q8, q8, #2
    751 +	vshl.i32	q8, q14, #7
    752 +	vsri.32	q8, q14, #25
    753 +	vext.32	q8, q8, q8, #1
    754 +	vadd.i32	q3, q3, q8
    755 +	veor	q2, q3, q9
    756 +	vrev32.16	q2, q2
    757 +	vadd.i32	q9, q1, q2
    758 +	veor	q8, q9, q8
    759 +	vshl.i32	q14, q8, #12
    760 +	vsri.32	q14, q8, #20
    761 +	vadd.i32	q3, q3, q14
    762 +	veor	q2, q3, q2
    763 +	vshl.i32	q8, q2, #8
    764 +	vsri.32	q8, q2, #24
    765 +	vadd.i32	q9, q9, q8
    766 +	vext.32	q8, q8, q8, #1
    767 +	veor	q14, q9, q14
    768 +	vext.32	q1, q9, q9, #2
    769 +	vshl.i32	q9, q14, #7
    770 +	vsri.32	q9, q14, #25
    771 +	vext.32	q14, q9, q9, #3
    772 +	bne	.L7
    773 +	vadd.i32	q8, q10, q8
    774 +	subs	r0, r0, #1
    775 +	vadd.i32	q3, q11, q3
    776 +	vldr	d0, [r1, #-16]
    777 +	vldr	d1, [r1, #-8]
    778 +	vadd.i32	q14, q12, q14
    779 +	vadd.i32	q1, q13, q1
    780 +	veor	q3, q3, q0
    781 +	vstr	d6, [r2, #-16]
    782 +	vstr	d7, [r2, #-8]
    783 +	vadd.i32	q10, q10, q15
    784 +	vld1.64	{d8-d9}, [r1:64]
    785 +	veor	q14, q14, q4
    786 +	vst1.64	{d28-d29}, [r2:64]
    787 +	vldr	d10, [r1, #16]
    788 +	vldr	d11, [r1, #24]
    789 +	veor	q1, q1, q5
    790 +	vstr	d2, [r2, #16]
    791 +	vstr	d3, [r2, #24]
    792 +	vldr	d18, [r1, #32]
    793 +	vldr	d19, [r1, #40]
    794 +	add	r1, r1, #64
    795 +	veor	q8, q8, q9
    796 +	vstr	d16, [r2, #32]
    797 +	vstr	d17, [r2, #40]
    798 +	add	r2, r2, #64
    799 +	bne	.L6
    800 +	lsls	r6, r6, #6
    801 +	adds	r4, r4, r6
    802 +	adds	r5, r5, r6
    803 +.L5:
    804 +	ldr	r6, [r7, #196]
    805 +	ands	ip, r6, #63
    806 +	beq	.L1
    807 +	vmov	q8, q10  @ v4si
    808 +	movs	r3, #10
    809 +	vmov	q14, q13  @ v4si
    810 +	vmov	q9, q12  @ v4si
    811 +	vmov	q15, q11  @ v4si
    812 +.L10:
    813 +	vadd.i32	q15, q15, q9
    814 +	subs	r3, r3, #1
    815 +	veor	q8, q8, q15
    816 +	vrev32.16	q8, q8
    817 +	vadd.i32	q3, q14, q8
    818 +	veor	q9, q3, q9
    819 +	vshl.i32	q14, q9, #12
    820 +	vsri.32	q14, q9, #20
    821 +	vadd.i32	q15, q15, q14
    822 +	veor	q9, q15, q8
    823 +	vshl.i32	q8, q9, #8
    824 +	vsri.32	q8, q9, #24
    825 +	vadd.i32	q9, q3, q8
    826 +	vext.32	q8, q8, q8, #3
    827 +	veor	q2, q9, q14
    828 +	vext.32	q14, q9, q9, #2
    829 +	vshl.i32	q9, q2, #7
    830 +	vsri.32	q9, q2, #25
    831 +	vext.32	q9, q9, q9, #1
    832 +	vadd.i32	q15, q15, q9
    833 +	veor	q3, q15, q8
    834 +	vrev32.16	q3, q3
    835 +	vadd.i32	q14, q14, q3
    836 +	veor	q8, q14, q9
    837 +	vshl.i32	q9, q8, #12
    838 +	vsri.32	q9, q8, #20
    839 +	vadd.i32	q15, q15, q9
    840 +	veor	q3, q15, q3
    841 +	vshl.i32	q8, q3, #8
    842 +	vsri.32	q8, q3, #24
    843 +	vadd.i32	q14, q14, q8
    844 +	vext.32	q8, q8, q8, #1
    845 +	veor	q3, q14, q9
    846 +	vext.32	q14, q14, q14, #2
    847 +	vshl.i32	q9, q3, #7
    848 +	vsri.32	q9, q3, #25
    849 +	vext.32	q9, q9, q9, #3
    850 +	bne	.L10
    851 +	cmp	ip, #15
    852 +	vadd.i32	q11, q11, q15
    853 +	bhi	.L37
    854 +	ldr	r9, [r7, #200]
    855 +	vst1.64	{d22-d23}, [r9:128]
    856 +.L14:
    857 +	ldr	sl, [r7, #196]
    858 +	and	r3, sl, #48
    859 +	cmp	ip, r3
    860 +	bls	.L1
    861 +	adds	r0, r5, r3
    862 +	adds	r1, r4, r3
    863 +	add	r2, r0, #16
    864 +	add	r6, r1, #16
    865 +	cmp	r1, r2
    866 +	it	cc
    867 +	cmpcc	r0, r6
    868 +	rsb	r9, r3, ip
    869 +	ite	cc
    870 +	movcc	r2, #0
    871 +	movcs	r2, #1
    872 +	cmp	r9, #15
    873 +	ite	ls
    874 +	movls	r2, #0
    875 +	andhi	r2, r2, #1
    876 +	lsr	r8, r9, #4
    877 +	eor	r2, r2, #1
    878 +	cmp	r8, #0
    879 +	it	eq
    880 +	orreq	r2, r2, #1
    881 +	lsl	sl, r8, #4
    882 +	cbnz	r2, .L35
    883 +	ldr	fp, [r7, #200]
    884 +	add	r6, fp, r3
    885 +.L17:
    886 +	vld1.8	{q8}, [r0]!
    887 +	adds	r2, r2, #1
    888 +	cmp	r8, r2
    889 +	vld1.8	{q9}, [r6]!
    890 +	veor	q8, q9, q8
    891 +	vst1.8	{q8}, [r1]!
    892 +	bhi	.L17
    893 +	cmp	r9, sl
    894 +	add	r3, r3, sl
    895 +	beq	.L1
    896 +.L35:
    897 +	ldr	r0, [r7, #200]
    898 +.L25:
    899 +	ldrb	r2, [r5, r3]	@ zero_extendqisi2
    900 +	ldrb	r1, [r3, r0]	@ zero_extendqisi2
    901 +	eors	r2, r2, r1
    902 +	strb	r2, [r4, r3]
    903 +	adds	r3, r3, #1
    904 +	cmp	ip, r3
    905 +	bhi	.L25
    906 +.L1:
    907 +	add	r7, r7, #304
    908 +	mov	sp, r7
    909 +	fldmfdd	sp!, {d8, d9, d10, d11, d12, d13, d14, d15}
    910 +	pop	{r4, r5, r6, r7, r8, r9, sl, fp}
    911 +	bx	lr
    912 +.L37:
    913 +	cmp	ip, #31
    914 +	vld1.64	{d0-d1}, [r5:64]
    915 +	vadd.i32	q9, q12, q9
    916 +	veor	q11, q11, q0
    917 +	vst1.64	{d22-d23}, [r4:64]
    918 +	bls	.L12
    919 +	cmp	ip, #47
    920 +	vldr	d2, [r5, #16]
    921 +	vldr	d3, [r5, #24]
    922 +	vadd.i32	q13, q13, q14
    923 +	veor	q9, q9, q1
    924 +	vstr	d18, [r4, #16]
    925 +	vstr	d19, [r4, #24]
    926 +	bls	.L13
    927 +	vadd.i32	q8, q8, q10
    928 +	vldr	d0, [r5, #32]
    929 +	vldr	d1, [r5, #40]
    930 +	ldr	r6, [r7, #200]
    931 +	vstr	d16, [r6, #48]
    932 +	vstr	d17, [r6, #56]
    933 +	veor	q8, q13, q0
    934 +	vstr	d16, [r4, #32]
    935 +	vstr	d17, [r4, #40]
    936 +	b	.L14
    937 +.L12:
    938 +	ldr	r8, [r7, #200]
    939 +	vstr	d18, [r8, #16]
    940 +	vstr	d19, [r8, #24]
    941 +	b	.L14
    942 +.L20:
    943 +	ldr	r5, [r7, #184]
    944 +	ldr	r4, [r7, #188]
    945 +	b	.L2
    946 +.L13:
    947 +	ldr	r6, [r7, #200]
    948 +	vstr	d26, [r6, #32]
    949 +	vstr	d27, [r6, #40]
    950 +	b	.L14
    951 +.L42:
    952 +	.align	3
    953 +.L41:
    954 +	.word	1
    955 +	.word	0
    956 +	.word	0
    957 +	.word	0
    958 +	.size	CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon
    959 +	.section	.rodata
    960 +	.align	3
    961 +.LANCHOR0 = . + 0
    962 +.LC0:
    963 +	.word	1634760805
    964 +	.word	857760878
    965 +	.word	2036477234
    966 +	.word	1797285236
    967 +	.ident	"GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro GCC 2012.10) 4.7.3 20121001 (prerelease)"
    968 +	.section	.note.GNU-stack,"",%progbits
    969 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c
    970 index 7bef015..3b6ab1d 100644
    971 --- a/crypto/cryptlib.c
    972 +++ b/crypto/cryptlib.c
    973 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type)
    974  		return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS));
    975  	}
    976  
    977 +#if __arm__
    978 +static int global_arm_neon_enabled = 0;
    979 +
    980 +void CRYPTO_set_NEON_capable(int on)
    981 +	{
    982 +	global_arm_neon_enabled = on != 0;
    983 +	}
    984 +
    985 +int CRYPTO_is_NEON_capable(void)
    986 +	{
    987 +	return global_arm_neon_enabled;
    988 +	}
    989 +#endif
    990 +
    991  #if	defined(__i386)   || defined(__i386__)   || defined(_M_IX86) || \
    992  	defined(__INTEL__) || \
    993  	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
    994 diff --git a/crypto/crypto.h b/crypto/crypto.h
    995 index e11ac73..db339c3 100644
    996 --- a/crypto/crypto.h
    997 +++ b/crypto/crypto.h
    998 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void);
    999  
   1000  int CRYPTO_get_new_lockid(char *name);
   1001  
   1002 +/* CRYPTO_set_NEON_capable enables any NEON (ARM vector) dependent code. It
   1003 + * should be called before any non-init functions. */
   1004 +void CRYPTO_set_NEON_capable(int on);
   1005 +
   1006 +/* CRYPTO_is_NEON_capable returns the last value given to
   1007 + * CRYPTO_set_NEON_capable, or else zero if it has never been called. */
   1008 +int CRYPTO_is_NEON_capable(void);
   1009 +
   1010  int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) */
   1011  void CRYPTO_lock(int mode, int type,const char *file,int line);
   1012  void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
   1013 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
   1014 index 2e5621d..00d53bf 100644
   1015 --- a/crypto/poly1305/poly1305.c
   1016 +++ b/crypto/poly1305/poly1305.c
   1017 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v)
   1018  	}
   1019  #endif
   1020  
   1021 +#if __arm__
   1022 +void CRYPTO_poly1305_init_neon(poly1305_state* state,
   1023 +			       const unsigned char key[32]);
   1024 +
   1025 +void CRYPTO_poly1305_update_neon(poly1305_state* state,
   1026 +				 const unsigned char *in,
   1027 +				 size_t in_len);
   1028 +
   1029 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]);
   1030 +#endif
   1031 +
   1032  static uint64_t
   1033  mul32x32_64(uint32_t a, uint32_t b)
   1034  	{
   1035 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const unsigned char key[32])
   1036  	struct poly1305_state_st *state = (struct poly1305_state_st*) statep;
   1037  	uint32_t t0,t1,t2,t3;
   1038  
   1039 +#if __arm__
   1040 +	if (CRYPTO_is_NEON_capable())
   1041 +		{
   1042 +		CRYPTO_poly1305_init_neon(statep, key);
   1043 +		return;
   1044 +		}
   1045 +#endif
   1046 +
   1047  	t0 = U8TO32_LE(key+0);
   1048  	t1 = U8TO32_LE(key+4);
   1049  	t2 = U8TO32_LE(key+8);
   1050 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const unsigned char *in,
   1051  	unsigned int i;
   1052  	struct poly1305_state_st *state = (struct poly1305_state_st*) statep;
   1053  
   1054 +#if __arm__
   1055 +	if (CRYPTO_is_NEON_capable())
   1056 +		{
   1057 +		CRYPTO_poly1305_update_neon(statep, in, in_len);
   1058 +		return;
   1059 +		}
   1060 +#endif
   1061 +
   1062  	if (state->buf_used)
   1063  		{
   1064  		unsigned int todo = 16 - state->buf_used;
   1065 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigned char mac[16])
   1066  	uint32_t g0,g1,g2,g3,g4;
   1067  	uint32_t b, nb;
   1068  
   1069 +#if __arm__
   1070 +	if (CRYPTO_is_NEON_capable())
   1071 +		{
   1072 +		CRYPTO_poly1305_finish_neon(statep, mac);
   1073 +		return;
   1074 +		}
   1075 +#endif
   1076 +
   1077  	if (state->buf_used)
   1078  		poly1305_update(state, state->buf, state->buf_used);
   1079  
   1080 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c
   1081 index adcef35..34e339d 100644
   1082 --- a/crypto/poly1305/poly1305_arm.c
   1083 +++ b/crypto/poly1305/poly1305_arm.c
   1084 @@ -51,6 +51,7 @@
   1085   * SUPERCOP by D. J. Bernstein and Peter Schwabe. */
   1086  
   1087  #include <stdint.h>
   1088 +#include <string.h>
   1089  
   1090  #include <openssl/poly1305.h>
   1091  
   1092 @@ -202,7 +203,8 @@ struct poly1305_state_st {
   1093  	unsigned char key[16];
   1094  };
   1095  
   1096 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32])
   1097 +void CRYPTO_poly1305_init_neon(poly1305_state *state,
   1098 +			       const unsigned char key[32])
   1099  	{
   1100  	struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
   1101  	fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
   1102 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32])
   1103  	st->buf_used = 0;
   1104  	}
   1105  
   1106 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size_t in_len)
   1107 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in,
   1108 +				 size_t in_len)
   1109  	{
   1110  	struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
   1111  	fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
   1112 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size
   1113  		}
   1114  	}
   1115  
   1116 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16])
   1117 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16])
   1118  	{
   1119  	struct poly1305_state_st *st = (struct poly1305_state_st*) (state);
   1120  	fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data)));
   1121 -- 
   1122 1.8.4.1
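
        For reference, a minimal caller against the patched tree might look
        like the sketch below; the key, nonce and buffer sizes are purely
        illustrative. The 16-byte alignment mirrors the dispatch check in
        CRYPTO_chacha_20, which otherwise silently falls back to the generic
        C path:

            #include <openssl/chacha.h>
            #include <openssl/crypto.h>

            int main(void)
                {
                static const unsigned char key[32] = {0};
                static const unsigned char nonce[8] = {0};
                /* Aligned so ((intptr_t)ptr & 15) == 0 holds for both
                 * buffers and the NEON path can be taken once enabled. */
                static unsigned char in[64] __attribute__((aligned(16)));
                static unsigned char out[64] __attribute__((aligned(16)));

                CRYPTO_set_NEON_capable(1); /* assumes the CPU was probed */
                CRYPTO_chacha_20(out, in, sizeof(in), key, nonce, 0);
                return 0;
                }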
   1123 
   1124