/* ---- lib/crypto/crypto_scrypt-neon-salsa208.h (new file) ---- */
/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */

#define ROUNDS 8

/*
 * salsa20_8_intrinsic(input):
 * Apply the Salsa20/8 core function, in place, to the 64-byte block at
 * input, using ARM NEON intrinsics.  input must be 16-byte aligned.
 *
 * Fix: the original assigned vld1q_u8() results (uint8x16_t) directly to
 * uint32x4_t variables and used C casts between vector types on the stores.
 * Those implicit/lax vector conversions are rejected by clang unless
 * -flax-vector-conversions is given; vreinterpretq_* performs the same
 * bit-for-bit reinterpretation portably.
 */
static void
salsa20_8_intrinsic(void * input)
{
	int i;

	/* Lane mask for vbslq_u32: all-ones lanes select from the second
	 * operand, zero lanes from the third. */
	const uint32x4_t abab = {-1, 0, -1, 0};

	/*
	 * This is modified since we only have one argument.  Usually you'd
	 * rearrange the constant, key, and input bytes, but we just have one
	 * linear array to rearrange, which is a bit easier.
	 */

	/*
	 * Change the input to be diagonals as if it's a 4x4 matrix of
	 * 32-bit values.
	 */
	uint32x4_t x0x5x10x15;
	uint32x4_t x12x1x6x11;
	uint32x4_t x8x13x2x7;
	uint32x4_t x4x9x14x3;

	uint32x4_t x0x1x10x11;
	uint32x4_t x12x13x6x7;
	uint32x4_t x8x9x2x3;
	uint32x4_t x4x5x14x15;

	uint32x4_t x0x1x2x3;
	uint32x4_t x4x5x6x7;
	uint32x4_t x8x9x10x11;
	uint32x4_t x12x13x14x15;

	x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8((uint8_t *) input));
	x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(16 + (uint8_t *) input));
	x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(32 + (uint8_t *) input));
	x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(48 + (uint8_t *) input));

	/* Swap 64-bit halves pairwise... */
	x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
	x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
	x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
	x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));

	/* ...then select alternating 32-bit lanes to finish the diagonals. */
	x0x5x10x15 = vbslq_u32(abab, x0x1x10x11, x4x5x14x15);
	x8x13x2x7 = vbslq_u32(abab, x8x9x2x3, x12x13x6x7);
	x4x9x14x3 = vbslq_u32(abab, x4x5x14x15, x8x9x2x3);
	x12x1x6x11 = vbslq_u32(abab, x12x13x6x7, x0x1x10x11);

	uint32x4_t start0 = x0x5x10x15;
	uint32x4_t start1 = x12x1x6x11;
	uint32x4_t start3 = x4x9x14x3;
	uint32x4_t start2 = x8x13x2x7;

	/* From here on this should be the same as the SUPERCOP version. */

	uint32x4_t diag0 = start0;
	uint32x4_t diag1 = start1;
	uint32x4_t diag2 = start2;
	uint32x4_t diag3 = start3;

	uint32x4_t a0;
	uint32x4_t a1;
	uint32x4_t a2;
	uint32x4_t a3;

	for (i = ROUNDS; i > 0; i -= 2) {
		/* Column round: each vshlq/vsriq pair is a constant-time
		 * rotate-left (by 7, 9, 13, 18). */
		a0 = diag1 + diag0;
		diag3 ^= vsriq_n_u32(vshlq_n_u32(a0, 7), a0, 25);
		a1 = diag0 + diag3;
		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1, 9), a1, 23);
		a2 = diag3 + diag2;
		diag1 ^= vsriq_n_u32(vshlq_n_u32(a2, 13), a2, 19);
		a3 = diag2 + diag1;
		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3, 18), a3, 14);

		diag3 = vextq_u32(diag3, diag3, 3);
		diag2 = vextq_u32(diag2, diag2, 2);
		diag1 = vextq_u32(diag1, diag1, 1);

		/* Row round. */
		a0 = diag3 + diag0;
		diag1 ^= vsriq_n_u32(vshlq_n_u32(a0, 7), a0, 25);
		a1 = diag0 + diag1;
		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1, 9), a1, 23);
		a2 = diag1 + diag2;
		diag3 ^= vsriq_n_u32(vshlq_n_u32(a2, 13), a2, 19);
		a3 = diag2 + diag3;
		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3, 18), a3, 14);

		diag1 = vextq_u32(diag1, diag1, 3);
		diag2 = vextq_u32(diag2, diag2, 2);
		diag3 = vextq_u32(diag3, diag3, 1);
	}

	/* Feed-forward: add the original input back in. */
	x0x5x10x15 = diag0 + start0;
	x12x1x6x11 = diag1 + start1;
	x8x13x2x7 = diag2 + start2;
	x4x9x14x3 = diag3 + start3;

	/* Invert the diagonal rearrangement. */
	x0x1x10x11 = vbslq_u32(abab, x0x5x10x15, x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab, x12x1x6x11, x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab, x8x13x2x7, x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab, x4x9x14x3, x0x5x10x15);

	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11), vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15), vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3), vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7), vget_high_u32(x4x5x14x15));

	vst1q_u8((uint8_t *) input, vreinterpretq_u8_u32(x0x1x2x3));
	vst1q_u8(16 + (uint8_t *) input, vreinterpretq_u8_u32(x4x5x6x7));
	vst1q_u8(32 + (uint8_t *) input, vreinterpretq_u8_u32(x8x9x10x11));
	vst1q_u8(48 + (uint8_t *) input, vreinterpretq_u8_u32(x12x13x14x15));
}

/* ---- lib/crypto/crypto_scrypt-neon.c (new file) ---- */
/*-
 * Copyright 2009 Colin Percival
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 */
/*-
 * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */
#include "scrypt_platform.h"

#include <arm_neon.h>

#include <errno.h>
#include <stdint.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#ifdef USE_OPENSSL_PBKDF2
#include <openssl/evp.h>
#else
#include "sha256.h"
#endif
#include "sysendian.h"

#include "crypto_scrypt.h"

#include "crypto_scrypt-neon-salsa208.h"

static void blkcpy(void *, void *, size_t);
static void blkxor(void *, void *, size_t);
/* NOTE(review): declared but neither defined nor referenced in this file;
 * presumably a leftover from the SUPERCOP import -- confirm before removing. */
void crypto_core_salsa208_armneon2(void *);
static void blockmix_salsa8(uint8x16_t *, uint8x16_t *, uint8x16_t *, size_t);
static uint64_t integerify(void *, size_t);
static void smix(uint8_t *, size_t, uint64_t, void *, void *);

/**
 * blkcpy(dest, src, len):
 * Copy len bytes from src to dest.  len must be a multiple of 16; both
 * buffers are accessed as 128-bit NEON vectors, so they are assumed to be
 * suitably aligned (all callers pass 64-byte-aligned scrypt blocks).
 */
static void
blkcpy(void * dest, void * src, size_t len)
{
	uint8x16_t * D = dest;
	uint8x16_t * S = src;
	size_t L = len / 16;	/* number of 16-byte vectors */
	size_t i;

	for (i = 0; i < L; i++)
		D[i] = S[i];
}

/**
 * blkxor(dest, src, len):
 * XOR len bytes of src into dest.  len must be a multiple of 16; both
 * buffers are accessed as 128-bit NEON vectors (same alignment assumption
 * as blkcpy above).
 */
static void
blkxor(void * dest, void * src, size_t len)
{
	uint8x16_t * D = dest;
	uint8x16_t * S = src;
	size_t L = len / 16;	/* number of 16-byte vectors */
	size_t i;

	for (i = 0; i < L; i++)
		D[i] = veorq_u8(D[i], S[i]);
}
/**
 * blockmix_salsa8(Bin, Bout, X, r):
 * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
 * bytes in length; the output Bout must be the same size; X is 64 bytes of
 * temporary space.  The pointers are uint8x16_t *, so the indices below
 * count 16-byte vectors: 4 vectors = one 64-byte salsa block, 8 vectors =
 * one 128-byte pair of blocks.
 */
static void
blockmix_salsa8(uint8x16_t * Bin, uint8x16_t * Bout, uint8x16_t * X, size_t r)
{
	size_t i;

	/* 1: X <-- B_{2r - 1} */
	blkcpy(X, &Bin[8 * r - 4], 64);

	/* 2: for i = 0 to 2r - 1 do */
	/* (Two salsa applications per iteration, hence i counts to r.) */
	for (i = 0; i < r; i++) {
		/* 3: X <-- H(X \xor B_i) */
		blkxor(X, &Bin[i * 8], 64);
		salsa20_8_intrinsic((void *) X);

		/* 4: Y_i <-- X */
		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
		blkcpy(&Bout[i * 4], X, 64);

		/* 3: X <-- H(X \xor B_i) */
		blkxor(X, &Bin[i * 8 + 4], 64);
		salsa20_8_intrinsic((void *) X);

		/* 4: Y_i <-- X */
		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
		blkcpy(&Bout[(r + i) * 4], X, 64);
	}
}

/**
 * integerify(B, r):
 * Return the result of parsing B_{2r-1} as a little-endian integer.
 */
static uint64_t
integerify(void * B, size_t r)
{
	/* B_{2r-1} is the last 64-byte block of B; le64dec (sysendian.h)
	 * reads its first 8 bytes as a little-endian 64-bit integer. */
	uint8_t * X = (void*)((uintptr_t)(B) + (2 * r - 1) * 64);

	return (le64dec(X));
}

/**
 * smix(B, r, N, V, XY):
 * Compute B = SMix_r(B, N).  The input B must be 128r bytes in length; the
 * temporary storage V must be 128rN bytes in length; the temporary storage
 * XY must be 256r + 64 bytes in length (128r for X, 128r for Y, and a
 * 64-byte scratch block Z used by blockmix_salsa8 -- matching the
 * 256 * r + 64 allocation in crypto_scrypt).  The value N must be a power
 * of 2.
 */
/* SMix_r: sequential memory-hard mixing (scrypt ROMix built on
 * BlockMix_{salsa20/8, r}).  B is 128r bytes; V is 128rN bytes of
 * scratch; XY holds X, Y, and a 64-byte Z block.  N must be a power
 * of 2 (the `& (N - 1)` below relies on it).
 * Fix: removed the unused locals `X32` and `k`, which were declared but
 * never referenced (leftovers from the generic implementation) and only
 * generated compiler warnings. */
static void
smix(uint8_t * B, size_t r, uint64_t N, void * V, void * XY)
{
	uint8x16_t * X = XY;
	uint8x16_t * Y = (void *)((uintptr_t)(XY) + 128 * r);
	uint8x16_t * Z = (void *)((uintptr_t)(XY) + 256 * r);
	uint64_t i, j;

	/* 1: X <-- B */
	blkcpy(X, B, 128 * r);

	/* 2: for i = 0 to N - 1 do */
	for (i = 0; i < N; i += 2) {
		/* 3: V_i <-- X */
		blkcpy((void *)((uintptr_t)(V) + i * 128 * r), X, 128 * r);

		/* 4: X <-- H(X) */
		blockmix_salsa8(X, Y, Z, r);

		/* 3: V_i <-- X */
		blkcpy((void *)((uintptr_t)(V) + (i + 1) * 128 * r),
		    Y, 128 * r);

		/* 4: X <-- H(X) */
		blockmix_salsa8(Y, X, Z, r);
	}

	/* 6: for i = 0 to N - 1 do */
	for (i = 0; i < N; i += 2) {
		/* 7: j <-- Integerify(X) mod N */
		j = integerify(X, r) & (N - 1);

		/* 8: X <-- H(X \xor V_j) */
		blkxor(X, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
		blockmix_salsa8(X, Y, Z, r);

		/* 7: j <-- Integerify(X) mod N */
		j = integerify(Y, r) & (N - 1);

		/* 8: X <-- H(X \xor V_j) */
		blkxor(Y, (void *)((uintptr_t)(V) + j * 128 * r), 128 * r);
		blockmix_salsa8(Y, X, Z, r);
	}

	/* 10: B' <-- X */
	blkcpy(B, X, 128 * r);
}

/**
 * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
 * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
 * p, buflen) and write the result into buf.  The parameters r, p, and buflen
 * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32.  The parameter N
 * must be a power of 2.
 *
 * Return 0 on success; or -1 on error.
 */
int
crypto_scrypt(const uint8_t * passwd, size_t passwdlen,
    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
    uint8_t * buf, size_t buflen)
{
	void * B0, * V0, * XY0;	/* raw (possibly unaligned) allocations */
	uint8_t * B;		/* 64-byte-aligned view of B0 */
	uint32_t * V;		/* 64-byte-aligned view of V0 */
	uint32_t * XY;		/* 64-byte-aligned view of XY0 */
	uint32_t i;

	/* Sanity-check parameters. */
#if SIZE_MAX > UINT32_MAX
	if (buflen > (((uint64_t)(1) << 32) - 1) * 32) {
		errno = EFBIG;
		goto err0;
	}
#endif
	if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) {
		errno = EFBIG;
		goto err0;
	}
	if (((N & (N - 1)) != 0) || (N == 0)) {
		errno = EINVAL;
		goto err0;
	}
	/* Guard the 128*r*p, 256*r, and 128*r*N size computations below
	 * against size_t overflow. */
	if ((r > SIZE_MAX / 128 / p) ||
#if SIZE_MAX / 256 <= UINT32_MAX
	    (r > SIZE_MAX / 256) ||
#endif
	    (N > SIZE_MAX / 128 / r)) {
		errno = ENOMEM;
		goto err0;
	}

	/* Allocate memory. */
#ifdef HAVE_POSIX_MEMALIGN
	if ((errno = posix_memalign(&B0, 64, 128 * r * p)) != 0)
		goto err0;
	B = (uint8_t *)(B0);
	/* XY holds X (128r), Y (128r), and a 64-byte Z scratch block. */
	if ((errno = posix_memalign(&XY0, 64, 256 * r + 64)) != 0)
		goto err1;
	XY = (uint32_t *)(XY0);
#ifndef MAP_ANON
	if ((errno = posix_memalign(&V0, 64, 128 * r * N)) != 0)
		goto err2;
	V = (uint32_t *)(V0);
#endif
#else
	/* No posix_memalign: over-allocate by 63 bytes and round the
	 * pointer up to the next 64-byte boundary by hand. */
	if ((B0 = malloc(128 * r * p + 63)) == NULL)
		goto err0;
	B = (uint8_t *)(((uintptr_t)(B0) + 63) & ~ (uintptr_t)(63));
	if ((XY0 = malloc(256 * r + 64 + 63)) == NULL)
		goto err1;
	XY = (uint32_t *)(((uintptr_t)(XY0) + 63) & ~ (uintptr_t)(63));
#ifndef MAP_ANON
	if ((V0 = malloc(128 * r * N + 63)) == NULL)
		goto err2;
	V = (uint32_t *)(((uintptr_t)(V0) + 63) & ~ (uintptr_t)(63));
#endif
#endif
#ifdef MAP_ANON
	/* Prefer anonymous mmap for the large V array (page-aligned, and
	 * MAP_NOCORE keeps key-dependent data out of core dumps where
	 * available).
	 * NOTE(review): MAP_ANON/mmap/munmap need <sys/mman.h>, which is not
	 * included directly here -- presumably scrypt_platform.h pulls it in;
	 * confirm, otherwise this branch is always compiled out. */
	if ((V0 = mmap(NULL, 128 * r * N, PROT_READ | PROT_WRITE,
#ifdef MAP_NOCORE
	    MAP_ANON | MAP_PRIVATE | MAP_NOCORE,
#else
	    MAP_ANON | MAP_PRIVATE,
#endif
	    -1, 0)) == MAP_FAILED)
		goto err2;
	V = (uint32_t *)(V0);
#endif

	/* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
#ifdef USE_OPENSSL_PBKDF2
	PKCS5_PBKDF2_HMAC((const char *)passwd, passwdlen, salt, saltlen, 1, EVP_sha256(), p * 128 * r, B);
#else
	PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, p * 128 * r);
#endif

	/* 2: for i = 0 to p - 1 do */
	for (i = 0; i < p; i++) {
		/* 3: B_i <-- MF(B_i, N) */
		smix(&B[i * 128 * r], r, N, V, XY);
	}

	/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
#ifdef USE_OPENSSL_PBKDF2
	PKCS5_PBKDF2_HMAC((const char *)passwd, passwdlen, B, p * 128 * r, 1, EVP_sha256(), buflen, buf);
#else
	PBKDF2_SHA256(passwd, passwdlen, B, p * 128 * r, 1, buf, buflen);
#endif

	/* Free memory. */
#ifdef MAP_ANON
	/* A failing munmap is reported as an overall failure even though
	 * buf has already been written. */
	if (munmap(V0, 128 * r * N))
		goto err2;
#else
	free(V0);
#endif
	free(XY0);
	free(B0);

	/* Success! */
	return (0);

err2:
	free(XY0);
err1:
	free(B0);
err0:
	/* Failure! */
	return (-1);
}