1 From aea47606333cfd3e7a09cab3e42e488c79a416af Mon Sep 17 00:00:00 2001 2 From: Adam Langley <agl@chromium.org> 3 Date: Tue, 5 Nov 2013 13:10:11 -0500 4 Subject: [PATCH 52/52] Optional NEON support on ARM. 5 6 This patch causes ARM to build both the NEON and generic versions of 7 ChaCha20 and Poly1305. The NEON code can be enabled at run-time by 8 calling CRYPTO_set_NEON_capable(1). 9 --- 10 .gitignore | 1 + 11 Configure | 2 +- 12 apps/speed.c | 5 + 13 crypto/chacha/chacha_enc.c | 18 + 14 crypto/chacha/chacha_vec.c | 7 + 15 crypto/chacha/chacha_vec_arm.S | 863 +++++++++++++++++++++++++++++++++++++++++ 16 crypto/cryptlib.c | 14 + 17 crypto/crypto.h | 8 + 18 crypto/poly1305/poly1305.c | 35 ++ 19 crypto/poly1305/poly1305_arm.c | 9 +- 20 10 files changed, 958 insertions(+), 4 deletions(-) 21 create mode 100644 crypto/chacha/chacha_vec_arm.S 22 23 diff --git a/Configure b/Configure 24 index 1b95384..18b7af0 100755 25 --- a/Configure 26 +++ b/Configure 27 @@ -136,7 +136,7 @@ my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-a 28 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::::"; 29 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::::"; 30 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes-s390x.o aes-ctr.o aes-xts.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::::ghash-s390x.o:"; 31 -my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec.o:poly1305_arm.o poly1305_arm_asm.o:void"; 32 +my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::chacha_vec_arm.o chacha_enc.o:poly1305.o poly1305_arm.o poly1305_arm_asm.o:void"; 33 my $parisc11_asm="pariscid.o:bn_asm.o 
parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::32"; 34 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::::64"; 35 my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::::"; 36 diff --git a/crypto/chacha/chacha_enc.c b/crypto/chacha/chacha_enc.c 37 index 54d1ca3..e4b648f 100644 38 --- a/crypto/chacha/chacha_enc.c 39 +++ b/crypto/chacha/chacha_enc.c 40 @@ -61,6 +61,7 @@ 41 42 #if !defined(OPENSSL_NO_CHACHA) 43 44 +#include <openssl/crypto.h> 45 #include <openssl/chacha.h> 46 47 /* sigma contains the ChaCha constants, which happen to be an ASCII string. */ 48 @@ -87,6 +88,15 @@ static const char sigma[16] = "expand 32-byte k"; 49 50 typedef unsigned int uint32_t; 51 52 +#if __arm__ 53 +/* Defined in chacha_vec.c */ 54 +void CRYPTO_chacha_20_neon(unsigned char *out, 55 + const unsigned char *in, size_t in_len, 56 + const unsigned char key[32], 57 + const unsigned char nonce[8], 58 + size_t counter); 59 +#endif 60 + 61 /* chacha_core performs |num_rounds| rounds of ChaCha20 on the input words in 62 * |input| and writes the 64 output bytes to |output|. 
*/ 63 static void chacha_core(unsigned char output[64], const uint32_t input[16], 64 @@ -124,6 +134,16 @@ void CRYPTO_chacha_20(unsigned char *out, 65 unsigned char buf[64]; 66 size_t todo, i; 67 68 +#if __arm__ 69 + if (CRYPTO_is_NEON_capable() && 70 + ((intptr_t)in & 15) == 0 && 71 + ((intptr_t)out & 15) == 0) 72 + { 73 + CRYPTO_chacha_20_neon(out, in, in_len, key, nonce, counter); 74 + return; 75 + } 76 +#endif 77 + 78 input[0] = U8TO32_LITTLE(sigma + 0); 79 input[1] = U8TO32_LITTLE(sigma + 4); 80 input[2] = U8TO32_LITTLE(sigma + 8); 81 diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c 82 index 33b2238..1226c39 100644 83 --- a/crypto/chacha/chacha_vec.c 84 +++ b/crypto/chacha/chacha_vec.c 85 @@ -154,7 +154,14 @@ typedef unsigned vec __attribute__ ((vector_size (16))); 86 STORE(op + d + 8, LOAD(in + d + 8) ^ REVV_BE(v2)); \ 87 STORE(op + d +12, LOAD(in + d +12) ^ REVV_BE(v3)); 88 89 +#if __ARM_NEON__ 90 +/* For ARM, we can't depend on NEON support, so this function is compiled with 91 + * a different name, along with the generic code, and can be enabled at 92 + * run-time. */ 93 +void CRYPTO_chacha_20_neon( 94 +#else 95 void CRYPTO_chacha_20( 96 +#endif 97 unsigned char *out, 98 const unsigned char *in, 99 size_t inlen, 100 diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S 101 new file mode 100644 102 index 0000000..24a5050 103 --- /dev/null 104 +++ b/crypto/chacha/chacha_vec_arm.S 105 @@ -0,0 +1,863 @@ 106 +# This file contains a pre-compiled version of chacha_vec.c for ARM. This is 107 +# needed to support switching on NEON code at runtime. If the whole of OpenSSL 108 +# were to be compiled with the needed flags to build chacha_vec.c, then it 109 +# wouldn't be possible to run on non-NEON systems. 
110 +# 111 +# This file was generated by: 112 +# 113 +# /opt/gcc-linaro-arm-linux-gnueabihf-4.7-2012.10-20121022_linux/bin/arm-linux-gnueabihf-gcc -O3 -mcpu=cortex-a8 -mfpu=neon -S chacha_vec.c -I ../../include -fpic -o chacha_vec_arm.S 114 +# 115 +# And then EABI attribute 28 was set to zero to allow linking with soft-float 116 +# code. 117 + 118 + .syntax unified 119 + .cpu cortex-a8 120 + .eabi_attribute 27, 3 121 + .eabi_attribute 28, 0 122 + .fpu neon 123 + .eabi_attribute 20, 1 124 + .eabi_attribute 21, 1 125 + .eabi_attribute 23, 3 126 + .eabi_attribute 24, 1 127 + .eabi_attribute 25, 1 128 + .eabi_attribute 26, 2 129 + .eabi_attribute 30, 2 130 + .eabi_attribute 34, 1 131 + .eabi_attribute 18, 4 132 + .thumb 133 + .file "chacha_vec.c" 134 + .text 135 + .align 2 136 + .global CRYPTO_chacha_20_neon 137 + .thumb 138 + .thumb_func 139 + .type CRYPTO_chacha_20_neon, %function 140 +CRYPTO_chacha_20_neon: 141 + @ args = 8, pretend = 0, frame = 304 142 + @ frame_needed = 1, uses_anonymous_args = 0 143 + @ link register save eliminated. 
144 + push {r4, r5, r6, r7, r8, r9, sl, fp} 145 + fstmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} 146 + sub sp, sp, #304 147 + add r7, sp, #0 148 + movw ip, #43691 149 + movt ip, 43690 150 + str r2, [r7, #196] 151 + sub sp, sp, #96 152 + ldr r4, [r7, #196] 153 + ldr r6, [r7, #400] 154 + ldr r2, .L38+16 155 + umull r4, ip, ip, r4 156 + ldr r6, [r6, #0] 157 + ldr r8, [r7, #400] 158 +.LPIC24: 159 + add r2, pc 160 + add r4, sp, #15 161 + str r3, [r7, #244] 162 + str r6, [r7, #176] 163 + bic r4, r4, #15 164 + str r0, [r7, #188] 165 + str r4, [r7, #200] 166 + lsrs ip, ip, #7 167 + str r1, [r7, #184] 168 + ldmia r2, {r0, r1, r2, r3} 169 + ldr r4, [r8, #4] 170 + ldr r5, [r7, #244] 171 + vld1.64 {d24-d25}, [r5:64] 172 + vldr d26, [r5, #16] 173 + vldr d27, [r5, #24] 174 + ldr r9, [r7, #200] 175 + ldr r8, [r7, #404] 176 + ldr r5, [r7, #176] 177 + add r6, r9, #64 178 + str r4, [r7, #300] 179 + mov r4, #0 180 + str r8, [r7, #288] 181 + str r5, [r7, #296] 182 + str r4, [r7, #292] 183 + stmia r6, {r0, r1, r2, r3} 184 + vldr d22, [r9, #64] 185 + vldr d23, [r9, #72] 186 + vldr d20, [r7, #288] 187 + vldr d21, [r7, #296] 188 + str ip, [r7, #192] 189 + beq .L20 190 + lsl r6, ip, #1 191 + ldr r1, [r9, #68] 192 + add r3, r6, ip 193 + str r6, [r7, #180] 194 + ldr r2, [r9, #72] 195 + add r8, r8, #2 196 + ldr r5, [r9, #76] 197 + vldr d18, .L38 198 + vldr d19, .L38+8 199 + str r4, [r7, #240] 200 + ldr r6, [r7, #184] 201 + ldr r4, [r7, #188] 202 + str r0, [r7, #224] 203 + str r1, [r7, #220] 204 + str r8, [r7, #208] 205 + str r2, [r7, #216] 206 + str r3, [r7, #204] 207 + str r5, [r7, #212] 208 + str r6, [r7, #252] 209 + str r4, [r7, #248] 210 +.L4: 211 + ldr r2, [r7, #244] 212 + add r9, r7, #216 213 + ldr r3, [r7, #244] 214 + vadd.i32 q8, q10, q9 215 + ldr r6, [r7, #208] 216 + vmov q15, q13 @ v4si 217 + ldr r5, [r7, #240] 218 + vmov q3, q12 @ v4si 219 + ldr r4, [r7, #244] 220 + vmov q2, q11 @ v4si 221 + adds r5, r5, r6 222 + ldr r2, [r2, #8] 223 + ldr r6, [r7, #400] 224 + vmov q5, q10 @ 
v4si 225 + ldr r3, [r3, #12] 226 + vmov q1, q13 @ v4si 227 + ldr r0, [r7, #244] 228 + vmov q0, q12 @ v4si 229 + ldr r1, [r7, #244] 230 + vmov q4, q11 @ v4si 231 + ldmia r9, {r9, sl, fp} 232 + str r5, [r7, #228] 233 + ldr r5, [r4, #24] 234 + ldr r0, [r0, #0] 235 + ldr r1, [r1, #4] 236 + str r2, [r7, #264] 237 + str r3, [r7, #236] 238 + ldr r2, [r6, #4] 239 + ldr r3, [r4, #28] 240 + str r5, [r7, #280] 241 + ldr r5, [r6, #0] 242 + movs r6, #0 243 + ldr ip, [r7, #228] 244 + ldr r8, [r7, #212] 245 + str r0, [r7, #232] 246 + str r1, [r7, #268] 247 + ldr r0, [r4, #16] 248 + ldr r1, [r4, #20] 249 + movs r4, #10 250 + str r2, [r7, #24] 251 + str r3, [r7, #284] 252 + str r4, [r7, #256] 253 + ldr r2, [r7, #264] 254 + str r9, [r7, #276] 255 + mov r9, r6 256 + ldr r6, [r7, #280] 257 + str r8, [r7, #260] 258 + mov r8, sl 259 + str r1, [r7, #272] 260 + mov sl, ip 261 + str r6, [r7, #264] 262 + mov r6, r5 263 + ldr r3, [r7, #236] 264 + mov r5, r0 265 + ldr ip, [r7, #24] 266 + ldr r1, [r7, #268] 267 + ldr r0, [r7, #232] 268 + b .L39 269 +.L40: 270 + .align 3 271 +.L38: 272 + .word 1 273 + .word 0 274 + .word 0 275 + .word 0 276 + .word .LANCHOR0-(.LPIC24+4) 277 +.L39: 278 +.L3: 279 + vadd.i32 q4, q4, q0 280 + add r8, r8, r1 281 + vadd.i32 q2, q2, q3 282 + str r8, [r7, #268] 283 + veor q5, q5, q4 284 + ldr r8, [r7, #276] 285 + veor q8, q8, q2 286 + add fp, fp, r0 287 + str fp, [r7, #280] 288 + add r8, r8, r2 289 + vrev32.16 q5, q5 290 + str r8, [r7, #276] 291 + vrev32.16 q8, q8 292 + vadd.i32 q1, q1, q5 293 + vadd.i32 q15, q15, q8 294 + ldr r8, [r7, #280] 295 + veor q0, q1, q0 296 + ldr r4, [r7, #260] 297 + veor q3, q15, q3 298 + eor sl, sl, r8 299 + ldr r8, [r7, #276] 300 + add fp, r4, r3 301 + vshl.i32 q7, q0, #12 302 + ldr r4, [r7, #268] 303 + vshl.i32 q6, q3, #12 304 + eor r6, r6, r8 305 + eor r9, r9, r4 306 + ldr r4, [r7, #272] 307 + vsri.32 q7, q0, #20 308 + ror r8, r6, #16 309 + ldr r6, [r7, #264] 310 + eor ip, ip, fp 311 + vsri.32 q6, q3, #20 312 + ror sl, sl, #16 313 + ror 
r9, r9, #16 314 + add r5, r5, sl 315 + vadd.i32 q4, q4, q7 316 + str r5, [r7, #236] 317 + vadd.i32 q2, q2, q6 318 + add r5, r4, r9 319 + add r4, r6, r8 320 + ldr r6, [r7, #284] 321 + ror ip, ip, #16 322 + veor q5, q4, q5 323 + veor q8, q2, q8 324 + add r6, r6, ip 325 + str r6, [r7, #264] 326 + eors r1, r1, r5 327 + ldr r6, [r7, #236] 328 + vshl.i32 q3, q5, #8 329 + vshl.i32 q14, q8, #8 330 + eors r2, r2, r4 331 + eors r0, r0, r6 332 + ldr r6, [r7, #264] 333 + vsri.32 q3, q5, #24 334 + ror r1, r1, #20 335 + eors r3, r3, r6 336 + ldr r6, [r7, #280] 337 + ror r0, r0, #20 338 + vsri.32 q14, q8, #24 339 + adds r6, r0, r6 340 + str r6, [r7, #284] 341 + ldr r6, [r7, #268] 342 + vadd.i32 q1, q1, q3 343 + vadd.i32 q15, q15, q14 344 + ror r2, r2, #20 345 + adds r6, r1, r6 346 + str r6, [r7, #260] 347 + ldr r6, [r7, #276] 348 + veor q6, q15, q6 349 + veor q7, q1, q7 350 + ror r3, r3, #20 351 + adds r6, r2, r6 352 + str r6, [r7, #280] 353 + ldr r6, [r7, #284] 354 + vshl.i32 q0, q6, #7 355 + vshl.i32 q5, q7, #7 356 + add fp, r3, fp 357 + eor sl, r6, sl 358 + ldr r6, [r7, #260] 359 + eor ip, fp, ip 360 + vsri.32 q0, q6, #25 361 + eor r9, r6, r9 362 + ldr r6, [r7, #280] 363 + ror sl, sl, #24 364 + vsri.32 q5, q7, #25 365 + eor r8, r6, r8 366 + ldr r6, [r7, #236] 367 + ror r9, r9, #24 368 + ror ip, ip, #24 369 + add r6, sl, r6 370 + str r6, [r7, #276] 371 + ldr r6, [r7, #264] 372 + add r5, r9, r5 373 + str r5, [r7, #272] 374 + vext.32 q5, q5, q5, #1 375 + add r5, ip, r6 376 + ldr r6, [r7, #276] 377 + vext.32 q0, q0, q0, #1 378 + vadd.i32 q4, q4, q5 379 + eors r0, r0, r6 380 + ldr r6, [r7, #272] 381 + vadd.i32 q2, q2, q0 382 + vext.32 q3, q3, q3, #3 383 + ror r8, r8, #24 384 + eors r1, r1, r6 385 + vext.32 q14, q14, q14, #3 386 + add r4, r8, r4 387 + ldr r6, [r7, #284] 388 + veor q3, q4, q3 389 + veor q14, q2, q14 390 + eors r2, r2, r4 391 + ror r1, r1, #25 392 + vext.32 q1, q1, q1, #2 393 + adds r6, r1, r6 394 + str r6, [r7, #284] 395 + vext.32 q15, q15, q15, #2 396 + ldr r6, [r7, 
#260] 397 + eors r3, r3, r5 398 + ror r2, r2, #25 399 + vrev32.16 q8, q14 400 + adds r6, r2, r6 401 + vrev32.16 q3, q3 402 + str r6, [r7, #268] 403 + vadd.i32 q1, q1, q3 404 + ldr r6, [r7, #280] 405 + vadd.i32 q15, q15, q8 406 + ror r3, r3, #25 407 + veor q5, q1, q5 408 + adds r6, r3, r6 409 + veor q0, q15, q0 410 + str r6, [r7, #264] 411 + ldr r6, [r7, #268] 412 + ror r0, r0, #25 413 + add fp, r0, fp 414 + vshl.i32 q6, q5, #12 415 + eor sl, r6, sl 416 + ldr r6, [r7, #284] 417 + vshl.i32 q14, q0, #12 418 + eor r8, fp, r8 419 + eor ip, r6, ip 420 + ldr r6, [r7, #264] 421 + vsri.32 q6, q5, #20 422 + ror sl, sl, #16 423 + eor r9, r6, r9 424 + ror r6, r8, #16 425 + vsri.32 q14, q0, #20 426 + ldr r8, [r7, #272] 427 + ror ip, ip, #16 428 + add r5, sl, r5 429 + add r8, r6, r8 430 + add r4, ip, r4 431 + str r4, [r7, #236] 432 + eor r0, r8, r0 433 + str r5, [r7, #280] 434 + vadd.i32 q4, q4, q6 435 + ldr r5, [r7, #236] 436 + vadd.i32 q2, q2, q14 437 + ldr r4, [r7, #276] 438 + ror r0, r0, #20 439 + veor q3, q4, q3 440 + eors r1, r1, r5 441 + veor q0, q2, q8 442 + str r8, [r7, #272] 443 + str r0, [r7, #24] 444 + add fp, r0, fp 445 + ldr r8, [r7, #280] 446 + ror r9, r9, #16 447 + ldr r0, [r7, #284] 448 + add r4, r9, r4 449 + str fp, [r7, #260] 450 + ror r1, r1, #20 451 + add fp, r1, r0 452 + eor r2, r8, r2 453 + ldr r0, [r7, #260] 454 + eors r3, r3, r4 455 + vshl.i32 q5, q3, #8 456 + str r4, [r7, #232] 457 + vshl.i32 q8, q0, #8 458 + ldr r4, [r7, #268] 459 + ldr r5, [r7, #264] 460 + ror r2, r2, #20 461 + ror r3, r3, #20 462 + eors r6, r6, r0 463 + adds r5, r3, r5 464 + add r8, r2, r4 465 + vsri.32 q5, q3, #24 466 + ldr r4, [r7, #272] 467 + eor r9, r5, r9 468 + eor ip, fp, ip 469 + vsri.32 q8, q0, #24 470 + eor sl, r8, sl 471 + ror r6, r6, #24 472 + ldr r0, [r7, #280] 473 + str r5, [r7, #276] 474 + adds r4, r6, r4 475 + ldr r5, [r7, #236] 476 + vadd.i32 q1, q1, q5 477 + str r4, [r7, #272] 478 + vadd.i32 q15, q15, q8 479 + ldr r4, [r7, #232] 480 + ror ip, ip, #24 481 + ror sl, 
sl, #24 482 + ror r9, r9, #24 483 + add r5, ip, r5 484 + add r0, sl, r0 485 + str r5, [r7, #264] 486 + add r5, r9, r4 487 + str r0, [r7, #284] 488 + veor q6, q1, q6 489 + ldr r4, [r7, #24] 490 + veor q14, q15, q14 491 + ldr r0, [r7, #272] 492 + eors r3, r3, r5 493 + vshl.i32 q0, q6, #7 494 + vext.32 q1, q1, q1, #2 495 + eors r0, r0, r4 496 + ldr r4, [r7, #284] 497 + str r0, [r7, #280] 498 + vshl.i32 q3, q14, #7 499 + eors r2, r2, r4 500 + ldr r4, [r7, #280] 501 + ldr r0, [r7, #264] 502 + vsri.32 q0, q6, #25 503 + ror r2, r2, #25 504 + ror r3, r3, #25 505 + eors r1, r1, r0 506 + vsri.32 q3, q14, #25 507 + ror r0, r4, #25 508 + ldr r4, [r7, #256] 509 + ror r1, r1, #25 510 + vext.32 q5, q5, q5, #1 511 + subs r4, r4, #1 512 + str r4, [r7, #256] 513 + vext.32 q15, q15, q15, #2 514 + vext.32 q8, q8, q8, #1 515 + vext.32 q0, q0, q0, #3 516 + vext.32 q3, q3, q3, #3 517 + bne .L3 518 + ldr r4, [r7, #264] 519 + vadd.i32 q14, q10, q9 520 + str r2, [r7, #264] 521 + vadd.i32 q10, q10, q5 522 + ldr r2, [r7, #252] 523 + vld1.64 {d12-d13}, [r2:64] 524 + ldr r2, [r7, #220] 525 + vadd.i32 q4, q11, q4 526 + str ip, [r7, #24] 527 + mov ip, sl 528 + mov sl, r8 529 + ldr r8, [r7, #260] 530 + add sl, sl, r2 531 + ldr r2, [r7, #212] 532 + str r4, [r7, #280] 533 + vadd.i32 q0, q12, q0 534 + ldr r4, [r7, #224] 535 + add r8, r8, r2 536 + ldr r2, [r7, #240] 537 + vadd.i32 q1, q13, q1 538 + str r0, [r7, #232] 539 + add fp, fp, r4 540 + mov r0, r5 541 + ldr r4, [r7, #216] 542 + mov r5, r6 543 + mov r6, r9 544 + ldr r9, [r7, #276] 545 + adds r2, r2, #3 546 + str r2, [r7, #240] 547 + vadd.i32 q2, q11, q2 548 + ldr r2, [r7, #252] 549 + add r9, r9, r4 550 + vadd.i32 q3, q12, q3 551 + ldr r4, [r7, #228] 552 + vadd.i32 q15, q13, q15 553 + str r1, [r7, #268] 554 + vadd.i32 q8, q14, q8 555 + str r3, [r7, #236] 556 + veor q4, q4, q6 557 + ldr r3, [r7, #284] 558 + ldr r1, [r7, #272] 559 + add ip, r4, ip 560 + ldr r4, [r7, #248] 561 + vst1.64 {d8-d9}, [r4:64] 562 + vldr d8, [r2, #16] 563 + vldr d9, [r2, 
#24] 564 + veor q0, q0, q4 565 + vstr d0, [r4, #16] 566 + vstr d1, [r4, #24] 567 + vldr d0, [r2, #32] 568 + vldr d1, [r2, #40] 569 + veor q1, q1, q0 570 + vstr d2, [r4, #32] 571 + vstr d3, [r4, #40] 572 + vldr d2, [r2, #48] 573 + vldr d3, [r2, #56] 574 + veor q10, q10, q1 575 + vstr d20, [r4, #48] 576 + vstr d21, [r4, #56] 577 + vldr d8, [r2, #64] 578 + vldr d9, [r2, #72] 579 + veor q2, q2, q4 580 + vstr d4, [r4, #64] 581 + vstr d5, [r4, #72] 582 + vldr d10, [r2, #80] 583 + vldr d11, [r2, #88] 584 + veor q3, q3, q5 585 + vstr d6, [r4, #80] 586 + vstr d7, [r4, #88] 587 + vldr d12, [r2, #96] 588 + vldr d13, [r2, #104] 589 + veor q15, q15, q6 590 + vstr d30, [r4, #96] 591 + vstr d31, [r4, #104] 592 + vldr d20, [r2, #112] 593 + vldr d21, [r2, #120] 594 + veor q8, q8, q10 595 + vstr d16, [r4, #112] 596 + vstr d17, [r4, #120] 597 + ldr r4, [r2, #128] 598 + ldr r2, [r7, #248] 599 + vadd.i32 q10, q14, q9 600 + eor r4, fp, r4 601 + vadd.i32 q10, q10, q9 602 + str r4, [r2, #128] 603 + ldr r4, [r7, #252] 604 + ldr r2, [r4, #132] 605 + eor r2, sl, r2 606 + ldr sl, [r7, #248] 607 + str r2, [sl, #132] 608 + ldr r2, [r4, #136] 609 + eor r2, r9, r2 610 + str r2, [sl, #136] 611 + ldr r2, [r4, #140] 612 + eor r2, r8, r2 613 + str r2, [sl, #140] 614 + ldr r2, [r7, #244] 615 + ldr r4, [r4, #144] 616 + ldr r2, [r2, #0] 617 + str r4, [r7, #44] 618 + ldr r4, [r7, #232] 619 + add r8, r4, r2 620 + ldr r2, [r7, #44] 621 + ldr r4, [r7, #244] 622 + eor r8, r8, r2 623 + ldr r2, [r7, #252] 624 + str r8, [sl, #144] 625 + ldr r4, [r4, #4] 626 + ldr r2, [r2, #148] 627 + str r2, [r7, #40] 628 + ldr r2, [r7, #268] 629 + add r8, r2, r4 630 + ldr r4, [r7, #40] 631 + ldr r2, [r7, #244] 632 + eor r8, r8, r4 633 + ldr r4, [r7, #252] 634 + str r8, [sl, #148] 635 + ldr r2, [r2, #8] 636 + ldr r4, [r4, #152] 637 + str r4, [r7, #36] 638 + ldr r4, [r7, #264] 639 + add r8, r4, r2 640 + ldr r2, [r7, #36] 641 + eor r8, r8, r2 642 + str r8, [sl, #152] 643 + ldr r2, [r7, #252] 644 + ldr r4, [r7, #244] 645 + ldr r2, 
[r2, #156] 646 + ldr r4, [r4, #12] 647 + str r2, [r7, #32] 648 + ldr r2, [r7, #236] 649 + add r8, r2, r4 650 + ldr r4, [r7, #32] 651 + ldr r2, [r7, #252] 652 + eor r8, r8, r4 653 + str r8, [sl, #156] 654 + ldr r8, [r7, #244] 655 + ldr r2, [r2, #160] 656 + ldr r4, [r8, #16] 657 + adds r0, r0, r4 658 + ldr r4, [r7, #252] 659 + eors r0, r0, r2 660 + str r0, [sl, #160] 661 + ldr r0, [r8, #20] 662 + ldr r2, [r4, #164] 663 + adds r1, r1, r0 664 + ldr r0, [r7, #280] 665 + eors r1, r1, r2 666 + str r1, [sl, #164] 667 + ldr r2, [r8, #24] 668 + ldr r1, [r4, #168] 669 + adds r2, r0, r2 670 + eors r2, r2, r1 671 + str r2, [sl, #168] 672 + ldr r1, [r8, #28] 673 + ldr r2, [r4, #172] 674 + adds r3, r3, r1 675 + eors r3, r3, r2 676 + str r3, [sl, #172] 677 + ldr r3, [r4, #176] 678 + eor r3, ip, r3 679 + str r3, [sl, #176] 680 + ldr r3, [r4, #180] 681 + ldr r4, [r7, #400] 682 + eors r6, r6, r3 683 + str r6, [sl, #180] 684 + ldr r6, [r7, #252] 685 + ldr r2, [r4, #0] 686 + ldr r3, [r6, #184] 687 + adds r5, r5, r2 688 + eors r5, r5, r3 689 + str r5, [sl, #184] 690 + ldr r2, [r6, #188] 691 + adds r6, r6, #192 692 + ldr r3, [r4, #4] 693 + str r6, [r7, #252] 694 + ldr r0, [r7, #24] 695 + ldr r1, [r7, #240] 696 + adds r4, r0, r3 697 + eors r4, r4, r2 698 + ldr r2, [r7, #204] 699 + str r4, [sl, #188] 700 + add sl, sl, #192 701 + cmp r1, r2 702 + str sl, [r7, #248] 703 + bne .L4 704 + ldr r4, [r7, #192] 705 + ldr r3, [r7, #180] 706 + ldr r6, [r7, #188] 707 + adds r5, r3, r4 708 + ldr r8, [r7, #184] 709 + lsls r5, r5, #6 710 + adds r4, r6, r5 711 + add r5, r8, r5 712 +.L2: 713 + ldr r9, [r7, #196] 714 + movw r3, #43691 715 + movt r3, 43690 716 + ldr sl, [r7, #196] 717 + umull r9, r3, r3, r9 718 + lsrs r3, r3, #7 719 + add r3, r3, r3, lsl #1 720 + sub r3, sl, r3, lsl #6 721 + lsrs r6, r3, #6 722 + beq .L5 723 + add r1, r5, #16 724 + add r2, r4, #16 725 + mov r0, r6 726 + vldr d30, .L41 727 + vldr d31, .L41+8 728 +.L6: 729 + vmov q8, q10 @ v4si 730 + movs r3, #10 731 + vmov q1, q13 @ v4si 732 
+ vmov q14, q12 @ v4si 733 + vmov q3, q11 @ v4si 734 +.L7: 735 + vadd.i32 q3, q3, q14 736 + subs r3, r3, #1 737 + veor q2, q8, q3 738 + vrev32.16 q2, q2 739 + vadd.i32 q8, q1, q2 740 + veor q9, q8, q14 741 + vshl.i32 q14, q9, #12 742 + vsri.32 q14, q9, #20 743 + vadd.i32 q3, q3, q14 744 + veor q2, q3, q2 745 + vshl.i32 q9, q2, #8 746 + vsri.32 q9, q2, #24 747 + vadd.i32 q8, q8, q9 748 + vext.32 q9, q9, q9, #3 749 + veor q14, q8, q14 750 + vext.32 q1, q8, q8, #2 751 + vshl.i32 q8, q14, #7 752 + vsri.32 q8, q14, #25 753 + vext.32 q8, q8, q8, #1 754 + vadd.i32 q3, q3, q8 755 + veor q2, q3, q9 756 + vrev32.16 q2, q2 757 + vadd.i32 q9, q1, q2 758 + veor q8, q9, q8 759 + vshl.i32 q14, q8, #12 760 + vsri.32 q14, q8, #20 761 + vadd.i32 q3, q3, q14 762 + veor q2, q3, q2 763 + vshl.i32 q8, q2, #8 764 + vsri.32 q8, q2, #24 765 + vadd.i32 q9, q9, q8 766 + vext.32 q8, q8, q8, #1 767 + veor q14, q9, q14 768 + vext.32 q1, q9, q9, #2 769 + vshl.i32 q9, q14, #7 770 + vsri.32 q9, q14, #25 771 + vext.32 q14, q9, q9, #3 772 + bne .L7 773 + vadd.i32 q8, q10, q8 774 + subs r0, r0, #1 775 + vadd.i32 q3, q11, q3 776 + vldr d0, [r1, #-16] 777 + vldr d1, [r1, #-8] 778 + vadd.i32 q14, q12, q14 779 + vadd.i32 q1, q13, q1 780 + veor q3, q3, q0 781 + vstr d6, [r2, #-16] 782 + vstr d7, [r2, #-8] 783 + vadd.i32 q10, q10, q15 784 + vld1.64 {d8-d9}, [r1:64] 785 + veor q14, q14, q4 786 + vst1.64 {d28-d29}, [r2:64] 787 + vldr d10, [r1, #16] 788 + vldr d11, [r1, #24] 789 + veor q1, q1, q5 790 + vstr d2, [r2, #16] 791 + vstr d3, [r2, #24] 792 + vldr d18, [r1, #32] 793 + vldr d19, [r1, #40] 794 + add r1, r1, #64 795 + veor q8, q8, q9 796 + vstr d16, [r2, #32] 797 + vstr d17, [r2, #40] 798 + add r2, r2, #64 799 + bne .L6 800 + lsls r6, r6, #6 801 + adds r4, r4, r6 802 + adds r5, r5, r6 803 +.L5: 804 + ldr r6, [r7, #196] 805 + ands ip, r6, #63 806 + beq .L1 807 + vmov q8, q10 @ v4si 808 + movs r3, #10 809 + vmov q14, q13 @ v4si 810 + vmov q9, q12 @ v4si 811 + vmov q15, q11 @ v4si 812 +.L10: 813 + vadd.i32 
q15, q15, q9 814 + subs r3, r3, #1 815 + veor q8, q8, q15 816 + vrev32.16 q8, q8 817 + vadd.i32 q3, q14, q8 818 + veor q9, q3, q9 819 + vshl.i32 q14, q9, #12 820 + vsri.32 q14, q9, #20 821 + vadd.i32 q15, q15, q14 822 + veor q9, q15, q8 823 + vshl.i32 q8, q9, #8 824 + vsri.32 q8, q9, #24 825 + vadd.i32 q9, q3, q8 826 + vext.32 q8, q8, q8, #3 827 + veor q2, q9, q14 828 + vext.32 q14, q9, q9, #2 829 + vshl.i32 q9, q2, #7 830 + vsri.32 q9, q2, #25 831 + vext.32 q9, q9, q9, #1 832 + vadd.i32 q15, q15, q9 833 + veor q3, q15, q8 834 + vrev32.16 q3, q3 835 + vadd.i32 q14, q14, q3 836 + veor q8, q14, q9 837 + vshl.i32 q9, q8, #12 838 + vsri.32 q9, q8, #20 839 + vadd.i32 q15, q15, q9 840 + veor q3, q15, q3 841 + vshl.i32 q8, q3, #8 842 + vsri.32 q8, q3, #24 843 + vadd.i32 q14, q14, q8 844 + vext.32 q8, q8, q8, #1 845 + veor q3, q14, q9 846 + vext.32 q14, q14, q14, #2 847 + vshl.i32 q9, q3, #7 848 + vsri.32 q9, q3, #25 849 + vext.32 q9, q9, q9, #3 850 + bne .L10 851 + cmp ip, #15 852 + vadd.i32 q11, q11, q15 853 + bhi .L37 854 + ldr r9, [r7, #200] 855 + vst1.64 {d22-d23}, [r9:128] 856 +.L14: 857 + ldr sl, [r7, #196] 858 + and r3, sl, #48 859 + cmp ip, r3 860 + bls .L1 861 + adds r0, r5, r3 862 + adds r1, r4, r3 863 + add r2, r0, #16 864 + add r6, r1, #16 865 + cmp r1, r2 866 + it cc 867 + cmpcc r0, r6 868 + rsb r9, r3, ip 869 + ite cc 870 + movcc r2, #0 871 + movcs r2, #1 872 + cmp r9, #15 873 + ite ls 874 + movls r2, #0 875 + andhi r2, r2, #1 876 + lsr r8, r9, #4 877 + eor r2, r2, #1 878 + cmp r8, #0 879 + it eq 880 + orreq r2, r2, #1 881 + lsl sl, r8, #4 882 + cbnz r2, .L35 883 + ldr fp, [r7, #200] 884 + add r6, fp, r3 885 +.L17: 886 + vld1.8 {q8}, [r0]! 887 + adds r2, r2, #1 888 + cmp r8, r2 889 + vld1.8 {q9}, [r6]! 890 + veor q8, q9, q8 891 + vst1.8 {q8}, [r1]! 
892 + bhi .L17 893 + cmp r9, sl 894 + add r3, r3, sl 895 + beq .L1 896 +.L35: 897 + ldr r0, [r7, #200] 898 +.L25: 899 + ldrb r2, [r5, r3] @ zero_extendqisi2 900 + ldrb r1, [r3, r0] @ zero_extendqisi2 901 + eors r2, r2, r1 902 + strb r2, [r4, r3] 903 + adds r3, r3, #1 904 + cmp ip, r3 905 + bhi .L25 906 +.L1: 907 + add r7, r7, #304 908 + mov sp, r7 909 + fldmfdd sp!, {d8, d9, d10, d11, d12, d13, d14, d15} 910 + pop {r4, r5, r6, r7, r8, r9, sl, fp} 911 + bx lr 912 +.L37: 913 + cmp ip, #31 914 + vld1.64 {d0-d1}, [r5:64] 915 + vadd.i32 q9, q12, q9 916 + veor q11, q11, q0 917 + vst1.64 {d22-d23}, [r4:64] 918 + bls .L12 919 + cmp ip, #47 920 + vldr d2, [r5, #16] 921 + vldr d3, [r5, #24] 922 + vadd.i32 q13, q13, q14 923 + veor q9, q9, q1 924 + vstr d18, [r4, #16] 925 + vstr d19, [r4, #24] 926 + bls .L13 927 + vadd.i32 q8, q8, q10 928 + vldr d0, [r5, #32] 929 + vldr d1, [r5, #40] 930 + ldr r6, [r7, #200] 931 + vstr d16, [r6, #48] 932 + vstr d17, [r6, #56] 933 + veor q8, q13, q0 934 + vstr d16, [r4, #32] 935 + vstr d17, [r4, #40] 936 + b .L14 937 +.L12: 938 + ldr r8, [r7, #200] 939 + vstr d18, [r8, #16] 940 + vstr d19, [r8, #24] 941 + b .L14 942 +.L20: 943 + ldr r5, [r7, #184] 944 + ldr r4, [r7, #188] 945 + b .L2 946 +.L13: 947 + ldr r6, [r7, #200] 948 + vstr d26, [r6, #32] 949 + vstr d27, [r6, #40] 950 + b .L14 951 +.L42: 952 + .align 3 953 +.L41: 954 + .word 1 955 + .word 0 956 + .word 0 957 + .word 0 958 + .size CRYPTO_chacha_20_neon, .-CRYPTO_chacha_20_neon 959 + .section .rodata 960 + .align 3 961 +.LANCHOR0 = . 
+ 0 962 +.LC0: 963 + .word 1634760805 964 + .word 857760878 965 + .word 2036477234 966 + .word 1797285236 967 + .ident "GCC: (crosstool-NG linaro-1.13.1-4.7-2012.10-20121022 - Linaro GCC 2012.10) 4.7.3 20121001 (prerelease)" 968 + .section .note.GNU-stack,"",%progbits 969 diff --git a/crypto/cryptlib.c b/crypto/cryptlib.c 970 index 7bef015..3b6ab1d 100644 971 --- a/crypto/cryptlib.c 972 +++ b/crypto/cryptlib.c 973 @@ -661,6 +661,20 @@ const char *CRYPTO_get_lock_name(int type) 974 return(sk_OPENSSL_STRING_value(app_locks,type-CRYPTO_NUM_LOCKS)); 975 } 976 977 +#if __arm__ 978 +static int global_arm_neon_enabled = 0; 979 + 980 +void CRYPTO_set_NEON_capable(int on) 981 + { 982 + global_arm_neon_enabled = on != 0; 983 + } 984 + 985 +int CRYPTO_is_NEON_capable(void) 986 + { 987 + return global_arm_neon_enabled; 988 + } 989 +#endif 990 + 991 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ 992 defined(__INTEL__) || \ 993 defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) 994 diff --git a/crypto/crypto.h b/crypto/crypto.h 995 index e11ac73..db339c3 100644 996 --- a/crypto/crypto.h 997 +++ b/crypto/crypto.h 998 @@ -414,6 +414,14 @@ void CRYPTO_cleanup_all_ex_data(void); 999 1000 int CRYPTO_get_new_lockid(char *name); 1001 1002 +/* CRYPTO_set_NEON_capable enables any NEON (ARM vector) dependent code. This 1003 + * function should be called before any non-init functions. */ 1004 +void CRYPTO_set_NEON_capable(int on); 1005 + 1006 +/* CRYPTO_is_NEON_capable returns the last value given to 1007 + * CRYPTO_set_NEON_capable, or else zero if it has never been called. */ 1008 +int CRYPTO_is_NEON_capable(void); 1009 + 1010 int CRYPTO_num_locks(void); /* return CRYPTO_NUM_LOCKS (shared libs!) 
*/ 1011 void CRYPTO_lock(int mode, int type,const char *file,int line); 1012 void CRYPTO_set_locking_callback(void (*func)(int mode,int type, 1013 diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c 1014 index 2e5621d..00d53bf 100644 1015 --- a/crypto/poly1305/poly1305.c 1016 +++ b/crypto/poly1305/poly1305.c 1017 @@ -90,6 +90,17 @@ static void U32TO8_LE(unsigned char *m, uint32_t v) 1018 } 1019 #endif 1020 1021 +#if __arm__ 1022 +void CRYPTO_poly1305_init_neon(poly1305_state* state, 1023 + const unsigned char key[32]); 1024 + 1025 +void CRYPTO_poly1305_update_neon(poly1305_state* state, 1026 + const unsigned char *in, 1027 + size_t in_len); 1028 + 1029 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]); 1030 +#endif 1031 + 1032 static uint64_t 1033 mul32x32_64(uint32_t a, uint32_t b) 1034 { 1035 @@ -207,6 +218,14 @@ void CRYPTO_poly1305_init(poly1305_state *statep, const unsigned char key[32]) 1036 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; 1037 uint32_t t0,t1,t2,t3; 1038 1039 +#if __arm__ 1040 + if (CRYPTO_is_NEON_capable()) 1041 + { 1042 + CRYPTO_poly1305_init_neon(statep, key); 1043 + return; 1044 + } 1045 +#endif 1046 + 1047 t0 = U8TO32_LE(key+0); 1048 t1 = U8TO32_LE(key+4); 1049 t2 = U8TO32_LE(key+8); 1050 @@ -241,6 +260,14 @@ void CRYPTO_poly1305_update(poly1305_state *statep, const unsigned char *in, 1051 unsigned int i; 1052 struct poly1305_state_st *state = (struct poly1305_state_st*) statep; 1053 1054 +#if __arm__ 1055 + if (CRYPTO_is_NEON_capable()) 1056 + { 1057 + CRYPTO_poly1305_update_neon(statep, in, in_len); 1058 + return; 1059 + } 1060 +#endif 1061 + 1062 if (state->buf_used) 1063 { 1064 unsigned int todo = 16 - state->buf_used; 1065 @@ -282,6 +309,14 @@ void CRYPTO_poly1305_finish(poly1305_state *statep, unsigned char mac[16]) 1066 uint32_t g0,g1,g2,g3,g4; 1067 uint32_t b, nb; 1068 1069 +#if __arm__ 1070 + if (CRYPTO_is_NEON_capable()) 1071 + { 1072 + 
CRYPTO_poly1305_finish_neon(statep, mac); 1073 + return; 1074 + } 1075 +#endif 1076 + 1077 if (state->buf_used) 1078 poly1305_update(state, state->buf, state->buf_used); 1079 1080 diff --git a/crypto/poly1305/poly1305_arm.c b/crypto/poly1305/poly1305_arm.c 1081 index adcef35..34e339d 100644 1082 --- a/crypto/poly1305/poly1305_arm.c 1083 +++ b/crypto/poly1305/poly1305_arm.c 1084 @@ -51,6 +51,7 @@ 1085 * SUPERCOP by D. J. Bernstein and Peter Schwabe. */ 1086 1087 #include <stdint.h> 1088 +#include <string.h> 1089 1090 #include <openssl/poly1305.h> 1091 1092 @@ -202,7 +203,8 @@ struct poly1305_state_st { 1093 unsigned char key[16]; 1094 }; 1095 1096 -void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) 1097 +void CRYPTO_poly1305_init_neon(poly1305_state *state, 1098 + const unsigned char key[32]) 1099 { 1100 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); 1101 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); 1102 @@ -227,7 +229,8 @@ void CRYPTO_poly1305_init(poly1305_state *state, const unsigned char key[32]) 1103 st->buf_used = 0; 1104 } 1105 1106 -void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size_t in_len) 1107 +void CRYPTO_poly1305_update_neon(poly1305_state *state, const unsigned char *in, 1108 + size_t in_len) 1109 { 1110 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); 1111 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); 1112 @@ -285,7 +288,7 @@ void CRYPTO_poly1305_update(poly1305_state *state, const unsigned char *in, size 1113 } 1114 } 1115 1116 -void CRYPTO_poly1305_finish(poly1305_state* state, unsigned char mac[16]) 1117 +void CRYPTO_poly1305_finish_neon(poly1305_state* state, unsigned char mac[16]) 1118 { 1119 struct poly1305_state_st *st = (struct poly1305_state_st*) (state); 1120 fe1305x2 *const r = (fe1305x2 *) (st->data + (15 & (-(int) st->data))); 1121 -- 1122 1.8.4.1 1123 1124