diff --git a/lib/crypto/crypto_scrypt-neon-salsa208.h b/lib/crypto/crypto_scrypt-neon-salsa208.h
new file mode 100644
index 0000000..a3b1019
--- /dev/null
+++ b/lib/crypto/crypto_scrypt-neon-salsa208.h
@@ -0,0 +1,120 @@
+/*
+ * version 20110505
+ * D. J. Bernstein
+ * Public domain.
+ *
+ * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
+ */
+
+#define ROUNDS 8
+static void
+salsa20_8_intrinsic(void * input)
+{
+	int i;
+
+	const uint32x4_t abab = {0xFFFFFFFF,0,0xFFFFFFFF,0};
+
+	/*
+	 * This is modified since we only have one argument.  Usually you'd rearrange
+	 * the constant, key, and input bytes, but we just have one linear array to
+	 * rearrange, which is a bit easier.
+	 */
+
+	/*
+	 * Change the input to be diagonals as if it's a 4x4 matrix of 32-bit values.
+	 */
+	uint32x4_t x0x5x10x15;
+	uint32x4_t x12x1x6x11;
+	uint32x4_t x8x13x2x7;
+	uint32x4_t x4x9x14x3;
+
+	uint32x4_t x0x1x10x11;
+	uint32x4_t x12x13x6x7;
+	uint32x4_t x8x9x2x3;
+	uint32x4_t x4x5x14x15;
+
+	uint32x4_t x0x1x2x3;
+	uint32x4_t x4x5x6x7;
+	uint32x4_t x8x9x10x11;
+	uint32x4_t x12x13x14x15;
+
+	x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8((uint8_t *) input));
+	x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(16 + (uint8_t *) input));
+	x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(32 + (uint8_t *) input));
+	x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(48 + (uint8_t *) input));
+
+	x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
+	x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
+	x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
+	x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));
+
+	x0x5x10x15 = vbslq_u32(abab,x0x1x10x11,x4x5x14x15);
+	x8x13x2x7 = vbslq_u32(abab,x8x9x2x3,x12x13x6x7);
+	x4x9x14x3 = vbslq_u32(abab,x4x5x14x15,x8x9x2x3);
+	x12x1x6x11 = vbslq_u32(abab,x12x13x6x7,x0x1x10x11);
+
+	uint32x4_t start0 = x0x5x10x15;
+	uint32x4_t start1 = x12x1x6x11;
+	uint32x4_t start2 = x8x13x2x7;
+	uint32x4_t start3 = x4x9x14x3;
+
+	/* From here on this should be the same as the SUPERCOP version. */
+
+	uint32x4_t diag0 = start0;
+	uint32x4_t diag1 = start1;
+	uint32x4_t diag2 = start2;
+	uint32x4_t diag3 = start3;
+
+	uint32x4_t a0;
+	uint32x4_t a1;
+	uint32x4_t a2;
+	uint32x4_t a3;
+
+	for (i = ROUNDS;i > 0;i -= 2) {
+		a0 = diag1 + diag0;
+		diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
+		a1 = diag0 + diag3;
+		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
+		a2 = diag3 + diag2;
+		diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
+		a3 = diag2 + diag1;
+		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);
+
+		diag3 = vextq_u32(diag3,diag3,3);
+		diag2 = vextq_u32(diag2,diag2,2);
+		diag1 = vextq_u32(diag1,diag1,1);
+
+		a0 = diag3 + diag0;
+		diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
+		a1 = diag0 + diag1;
+		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
+		a2 = diag1 + diag2;
+		diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
+		a3 = diag2 + diag3;
+		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);
+
+		diag1 = vextq_u32(diag1,diag1,3);
+		diag2 = vextq_u32(diag2,diag2,2);
+		diag3 = vextq_u32(diag3,diag3,1);
+	}
+
+	x0x5x10x15 = diag0 + start0;
+	x12x1x6x11 = diag1 + start1;
+	x8x13x2x7 = diag2 + start2;
+	x4x9x14x3 = diag3 + start3;
+
+	x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
+	x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
+	x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
+	x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);
+
+	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
+	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
+	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
+	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));
+
+	vst1q_u8((uint8_t *) input,vreinterpretq_u8_u32(x0x1x2x3));
+	vst1q_u8(16 + (uint8_t *) input,vreinterpretq_u8_u32(x4x5x6x7));
+	vst1q_u8(32 + (uint8_t *) input,vreinterpretq_u8_u32(x8x9x10x11));
+	vst1q_u8(48 + (uint8_t *) input,vreinterpretq_u8_u32(x12x13x14x15));
+}
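For review: the NEON kernel above should produce byte-identical output to the
scalar salsa20/8 core from the scrypt reference code. A minimal scalar sketch
for cross-checking salsa20_8_intrinsic on 64-byte blocks (C99; the name
salsa20_8_ref and its stand-alone framing are illustrative only, not part of
this patch):

	#include <stdint.h>
	#include <string.h>

	#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))

	/* Scalar salsa20/8: block = block + SalsaPermutation(block), 8 rounds. */
	static void
	salsa20_8_ref(uint8_t block[64])
	{
		uint32_t B[16], x[16];
		int i;

		/* Decode the block as 16 little-endian 32-bit words. */
		for (i = 0; i < 16; i++)
			B[i] = (uint32_t)block[4*i] |
			    (uint32_t)block[4*i+1] << 8 |
			    (uint32_t)block[4*i+2] << 16 |
			    (uint32_t)block[4*i+3] << 24;
		memcpy(x, B, sizeof(x));

		for (i = 0; i < 8; i += 2) {
			/* Column round. */
			x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
			x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
			x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
			x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
			x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
			x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
			x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
			x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
			/* Row round. */
			x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
			x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
			x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
			x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
			x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
			x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
			x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
			x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
		}

		/* Feed-forward and re-encode little-endian. */
		for (i = 0; i < 16; i++) {
			B[i] += x[i];
			block[4*i] = (uint8_t)B[i];
			block[4*i+1] = (uint8_t)(B[i] >> 8);
			block[4*i+2] = (uint8_t)(B[i] >> 16);
			block[4*i+3] = (uint8_t)(B[i] >> 24);
		}
	}

The NEON version computes the same rounds without a per-round transpose: the
state is kept as four diagonals, vshlq_n_u32/vsriq_n_u32 pairs implement the
32-bit rotates, and vextq_u32 rotates lanes between the column and row
half-rounds.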
diff --git a/lib/crypto/crypto_scrypt-neon.c b/lib/crypto/crypto_scrypt-neon.c
new file mode 100644
index 0000000..a3bf052
--- /dev/null
+++ b/lib/crypto/crypto_scrypt-neon.c
@@ -0,0 +1,305 @@
+/*-
+ * Copyright 2009 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+#include "scrypt_platform.h"
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <machine/cpu-features.h>
+#include <arm_neon.h>
+
+#include <errno.h>
+#include <stdint.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef USE_OPENSSL_PBKDF2
+#include <openssl/evp.h>
+#else
+#include "sha256.h"
+#endif
+#include "sysendian.h"
+
+#include "crypto_scrypt.h"
+
+#include "crypto_scrypt-neon-salsa208.h"
+
+static void blkcpy(void *, void *, size_t);
+static void blkxor(void *, void *, size_t);
+static void blockmix_salsa8(uint8x16_t *, uint8x16_t *, uint8x16_t *, size_t);
+static uint64_t integerify(void *, size_t);
+static void smix(uint8_t *, size_t, uint64_t, void *, void *);
+
+static void
+blkcpy(void * dest, void * src, size_t len)
+{
+	uint8x16_t * D = dest;
+	uint8x16_t * S = src;
+	size_t L = len / 16;
+	size_t i;
+
+	for (i = 0; i < L; i++)
+		D[i] = S[i];
+}
+
+static void
+blkxor(void * dest, void * src, size_t len)
+{
+	uint8x16_t * D = dest;
+	uint8x16_t * S = src;
+	size_t L = len / 16;
+	size_t i;
+
+	for (i = 0; i < L; i++)
+		D[i] = veorq_u8(D[i], S[i]);
+}
+
+/**
+ * blockmix_salsa8(Bin, Bout, X, r):
+ * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
+ * bytes in length; the output Bout must also be the same size.  The
+ * temporary space X must be 64 bytes.
+ */
+static void
+blockmix_salsa8(uint8x16_t * Bin, uint8x16_t * Bout, uint8x16_t * X, size_t r)
+{
+	size_t i;
+
+	/* 1: X <-- B_{2r - 1} */
+	blkcpy(X, &Bin[8 * r - 4], 64);
+
+	/* 2: for i = 0 to 2r - 1 do */
+	for (i = 0; i < r; i++) {
+		/* 3: X <-- H(X \xor B_i) */
+		blkxor(X, &Bin[i * 8], 64);
+		salsa20_8_intrinsic((void *) X);
+
+		/* 4: Y_i <-- X */
+		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+		blkcpy(&Bout[i * 4], X, 64);
+
+		/* 3: X <-- H(X \xor B_i) */
+		blkxor(X, &Bin[i * 8 + 4], 64);
+		salsa20_8_intrinsic((void *) X);
+
+		/* 4: Y_i <-- X */
+		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
+		blkcpy(&Bout[(r + i) * 4], X, 64);
+	}
+}
+
+/**
+ * integerify(B, r):
+ * Return the result of parsing B_{2r-1} as a little-endian integer.
+ */
+static uint64_t
+integerify(void * B, size_t r)
+{
+	uint8_t * X = (void*)((uintptr_t)(B) + (2 * r - 1) * 64);
+
+	return (le64dec(X));
+}
+
+/**
+ * smix(B, r, N, V, XY):
+ * Compute B = SMix_r(B, N).  The input B must be 128r bytes in length; the
+ * temporary storage V must be 128rN bytes in length; the temporary storage
+ * XY must be 256r bytes in length.  The value N must be a power of 2.
+ */
+static void
+smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
+{
+	uint8x16_t * X = XY;
+	uint8x16_t * Y = (void *)((uintptr_t)(XY) + 128 * r);
+	uint8x16_t * Z = (void *)((uintptr_t)(XY) + 256 * r);
+	uint64_t i, j;
+
+	/* 1: X <-- B */
+	blkcpy(X, B, 128 * r);
+
+	/* 2: for i = 0 to N - 1 do */
+	for (i = 0; i < N; i += 2) {
+		/* 3: V_i <-- X */
+		blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r);
+
+		/* 4: X <-- H(X) */
+		blockmix_salsa8(X, Y, Z, r);
+
+		/* 3: V_i <-- X */
+		blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r),
+		    Y, 128 * r);
+
+		/* 4: X <-- H(X) */
+		blockmix_salsa8(Y, X, Z, r);
+	}
+
+	/* 6: for i = 0 to N - 1 do */
+	for (i = 0; i < N; i += 2) {
+		/* 7: j <-- Integerify(X) mod N */
+		j = integerify(X, r) & (N - 1);
+
+		/* 8: X <-- H(X \xor V_j) */
+		blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
+		blockmix_salsa8(X, Y, Z, r);
+
+		/* 7: j <-- Integerify(X) mod N */
+		j = integerify(Y, r) & (N - 1);
+
+		/* 8: X <-- H(X \xor V_j) */
+		blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
+		blockmix_salsa8(Y, X, Z, r);
+	}
+
+	/* 10: B' <-- X */
+	blkcpy(B, X, 128 * r);
+}
+
+/**
+ * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
+ * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
+ * p, buflen) and write the result into buf.  The parameters r, p, and buflen
+ * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32.  The parameter N
+ * must be a power of 2.
+ *
+ * Return 0 on success; or -1 on error.
+ */
+int
+crypto_scrypt(const uint8_t * passwd, size_t passwdlen,
+    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
+    uint8_t * buf, size_t buflen)
+{
+	void * B0, * V0, * XY0;
+	uint8_t * B;
+	uint32_t * V;
+	uint32_t * XY;
+	uint32_t i;
+
+	/* Sanity-check parameters. */
+#if SIZE_MAX > UINT32_MAX
+	if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
+		errno = EFBIG;
+		goto err0;
+	}
+#endif
+	if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
+		errno = EFBIG;
+		goto err0;
+	}
+	if (((N & (N - 1)) != 0) || (N == 0)) {
+		errno = EINVAL;
+		goto err0;
+	}
+	if ((r > SIZE_MAX / 128 / p) ||
+#if SIZE_MAX / 256 <= UINT32_MAX
+	    (r > SIZE_MAX / 256) ||
+#endif
+	    (N > SIZE_MAX / 128 / r)) {
+		errno = ENOMEM;
+		goto err0;
+	}
+
+	/* Allocate memory. */
+#ifdef HAVE_POSIX_MEMALIGN
+	if ((errno = posix_memalign(&B0, 64, 128 * r * p)) != 0)
+		goto err0;
+	B = (uint8_t *)(B0);
+	if ((errno = posix_memalign(&XY0, 64, 256 * r + 64)) != 0)
+		goto err1;
+	XY = (uint32_t *)(XY0);
+#ifndef MAP_ANON
+	if ((errno = posix_memalign(&V0, 64, 128 * r * N)) != 0)
+		goto err2;
+	V = (uint32_t *)(V0);
+#endif
+#else
+	if ((B0 = malloc(128 * r * p + 63)) == NULL)
+		goto err0;
+	B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63));
+	if ((XY0 = malloc(256 * r + 64 + 63)) == NULL)
+		goto err1;
+	XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
+#ifndef MAP_ANON
+	if ((V0 = malloc(128 * r * N + 63)) == NULL)
+		goto err2;
+	V = (uint32_t *)(((uintptr_t)(V0) + 63) & ~ (uintptr_t)(63));
+#endif
+#endif
+#ifdef MAP_ANON
+	if ((V0 = mmap(NULL, 128 * r * N, PROT_READ | PROT_WRITE,
+#ifdef MAP_NOCORE
+	    MAP_ANON | MAP_PRIVATE | MAP_NOCORE,
+#else
+	    MAP_ANON | MAP_PRIVATE,
+#endif
+	    -1, 0)) == MAP_FAILED)
+		goto err2;
+	V = (uint32_t *)(V0);
+#endif
+
+	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
+#ifdef USE_OPENSSL_PBKDF2
+	PKCS5_PBKDF2_HMAC((const char *)passwd, passwdlen, salt, saltlen, 1, EVP_sha256(), p * 128 * r, B);
+#else
+	PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, p * 128 * r);
+#endif
+
+	/* 2: for i = 0 to p - 1 do */
+	for (i = 0; i < p; i++) {
+		/* 3: B_i <-- MF(B_i, N) */
+		smix(&B[i * 128 * r], r, N, V, XY);
+	}
+
+	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
+#ifdef USE_OPENSSL_PBKDF2
+	PKCS5_PBKDF2_HMAC((const char *)passwd, passwdlen, B, p * 128 * r, 1, EVP_sha256(), buflen, buf);
+#else
+	PBKDF2_SHA256(passwd, passwdlen, B, p * 128 * r, 1, buf, buflen);
+#endif
+
+	/* Free memory. */
+#ifdef MAP_ANON
+	if (munmap(V0, 128 * r * N))
+		goto err2;
+#else
+	free(V0);
+#endif
+	free(XY0);
+	free(B0);
+
+	/* Success! */
+	return (0);
+
+err2:
+	free(XY0);
+err1:
+	free(B0);
+err0:
+	/* Failure! */
+	return (-1);
+}
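A minimal caller sketch (illustrative, not part of this patch; assumes
crypto_scrypt.h is on the include path and the binary links against this file
plus the PBKDF2/SHA-256 backend). N = 16384, r = 8, p = 1 is a common
interactive-login parameter choice; N must be a power of 2:

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	#include "crypto_scrypt.h"

	int
	main(void)
	{
		const uint8_t passwd[] = "password";
		const uint8_t salt[] = "NaCl";	/* use a random per-user salt */
		uint8_t dk[32];
		size_t i;

		/* Derive a 32-byte key from the password and salt. */
		if (crypto_scrypt(passwd, sizeof(passwd) - 1,
		    salt, sizeof(salt) - 1, 16384, 8, 1, dk, sizeof(dk)) != 0) {
			perror("crypto_scrypt");
			return (1);
		}

		for (i = 0; i < sizeof(dk); i++)
			printf("%02x", dk[i]);
		printf("\n");
		return (0);
	}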