diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm new file mode 100644 index 00000000..cdb6133a --- /dev/null +++ b/arm/neon/salsa20-2core.asm @@ -0,0 +1,206 @@ +C arm/neon/salsa20-2core.asm + +ifelse(< + Copyright (C) 2020 Niels Möller + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. 
+>)
+
+	.file "salsa20-2core.asm"
+	.fpu	neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, even elements in X, odd elements in Y (no callee-saved q4-q7 used)
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<T0>, <q12>)
+define(<T1>, <q13>)
+define(<T2>, <q14>)
+define(<T3>, <q15>)
+
+	.text
+	.align 4
+.Lcount1:
+	.int 1,0,0,0		C 64-bit constant 1, added to the block counter
+
+	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_salsa20_2core)
+	vldm	SRC, {X0,X1,X2,X3}	C Load the 16-word input state
+	adr	r12, .Lcount1		C r12 = address of the constant above
+
+	vmov	Y3, X0
+	vld1.64	{Y1}, [r12]		C Y1 = {1,0,0,0}
+	vmov	Y0, X1
+	vadd.i64 Y1, Y1, X2	C Increment counter, second block uses counter + 1
+	vmov	Y2, X3
+
+	vtrn.32	X0, Y3		C X0:  0  0  2  2  Y3:  1  1  3  3
+	vtrn.32	X1, Y0		C X1:  4  4  6  6  Y0:  5  5  7  7
+	vtrn.32	X2, Y1		C X2:  8  8 10 10  Y1:  9  9 11 11
+	vtrn.32	X3, Y2		C X3: 12 12 14 14  Y2: 13 13 15 15
+
+	C Swap, to get
+	C X0:  0 10  Y0:  5 15
+	C X1:  4 14  Y1:  9  3
+	C X2:  8  2  Y2: 13  7
+	C X3: 12  6  Y3:  1 11
+	vswp	D1REG(X0), D1REG(X2)
+	vswp	D1REG(X1), D1REG(X3)
+	vswp	D1REG(Y0), D1REG(Y2)
+	vswp	D1REG(Y1), D1REG(Y3)
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0:  A0  B0 A10 B10  Y0:  A5  B5 A15 B15
+C X1:  A4  B4 A14 B14  Y1:  A9  B9  A3  B3
+C X2:  A8  B8  A2  B2  Y2: A13 B13  A7  B7
+C X3: A12 B12  A6  B6  Y3:  A1  B1 A11 B11
+
+	vadd.i32	T0, X0, X3
+	vshl.i32	T1, T0, #7
+	vadd.i32	T2, Y0, Y3
+	vsri.u32	T1, T0, #25	C vshl + vsri together form a rotate left by 7
+	vshl.i32	T3, T2, #7
+	veor		X1, X1, T1
+	vsri.u32	T3, T2, #25
+	vadd.i32	T0, X1, X0
+	veor		Y1, Y1, T3
+	vshl.i32	T1, T0, #9
+	vadd.i32	T2, Y1, Y0
+	vsri.u32	T1, T0, #23
+	vshl.i32	T3, T2, #9
+	veor		X2, X2, T1
+	vsri.u32	T3, T2, #23
+	vadd.i32	T0, X2, X1
+	veor		Y2, Y2, T3
+	vshl.i32	T1, T0, #13
+	vadd.i32	T2, Y2, Y1
+	vsri.u32	T1, T0, #19
+	vshl.i32	T3, T2, #13
+	veor		X3, X3, T1
+	vsri.u32	T3, T2, #19
+	vadd.i32	T0, X3, X2
+	veor		Y3, Y3, T3
+	vshl.i32	T1, T0, #18
+	vadd.i32	T2, Y3, Y2
+	vext.32		Y1, Y1, Y1, #2
+	vsri.u32	T1, T0, #14
+	vshl.i32	T3, T2, #18
+	vext.32		Y2, Y2, Y2, #2
+	veor		X0, X0, T1
+	vsri.u32	T3, T2, #14
+	vext.32		X3, X3, X3, #2
+	veor		Y0, Y0, T3
+
+C Register layout:
+C X0:  A0  B0 A10 B10  Y0:  A5  B5 A15 B15
+C Y1:  A3  B3  A9  B9  X1:  A4  B4 A14 B14 (Y1 swapped)
+C X2:  A2  B2  A8  B8  Y2:  A7  B7 A13 B13 (X2, Y2 swapped)
+C Y3:  A1  B1 A11 B11  X3:  A6  B6 A12 B12 (X3 swapped)
+
+	vadd.i32	T0, X0, Y1
+	vext.32		X2, X2, X2, #2
+	vshl.i32	T1, T0, #7
+	vadd.i32	T2, Y0, X1
+	vsri.u32	T1, T0, #25
+	vshl.i32	T3, T2, #7
+	veor		Y3, Y3, T1
+	vsri.u32	T3, T2, #25
+	vadd.i32	T0, Y3, X0
+	veor		X3, X3, T3
+	vshl.i32	T1, T0, #9
+	vadd.i32	T2, X3, Y0
+	vsri.u32	T1, T0, #23
+	vshl.i32	T3, T2, #9
+	veor		X2, X2, T1
+	vsri.u32	T3, T2, #23
+	vadd.i32	T0, X2, Y3
+	veor		Y2, Y2, T3
+	vshl.i32	T1, T0, #13
+	vadd.i32	T2, Y2, X3
+	vsri.u32	T1, T0, #19
+	vshl.i32	T3, T2, #13
+	veor		Y1, Y1, T1
+	vsri.u32	T3, T2, #19
+	vadd.i32	T0, Y1, X2
+	veor		X1, X1, T3
+	vext.32		X2, X2, X2, #2
+	vshl.i32	T1, T0, #18
+	vadd.i32	T2, X1, Y2
+	vext.32		Y1, Y1, Y1, #2
+	vsri.u32	T1, T0, #14
+	subs		ROUNDS, ROUNDS, #2	C Each iteration performs two rounds
+	vshl.i32	T3, T2, #18
+	vext.32		X3, X3, X3, #2
+	veor		X0, X0, T1
+	vsri.u32	T3, T2, #14
+	vext.32		Y2, Y2, Y2, #2
+	veor		Y0, Y0, T3
+
+	bhi		.Loop		C Repeat while rounds remain, flags are from the subs above
+
+C Inverse swaps and transpositions
+
+	vswp	D1REG(X0), D1REG(X2)
+	vswp	D1REG(X1), D1REG(X3)
+	vswp	D1REG(Y0), D1REG(Y2)
+	vswp	D1REG(Y1), D1REG(Y3)
+
+	vldm	SRC, {T0,T1,T2,T3}	C Reload the original input state
+
+	vtrn.32	X0, Y3
+	vtrn.32	X1, Y0
+	vtrn.32	X2, Y1
+	vtrn.32	X3, Y2
+
+C Add in the original context
+	vadd.i32	X0, X0, T0
+	vadd.i32	X1, X1, T1
+	vadd.i32	X2, X2, T2
+	vadd.i32	X3, X3, T3
+
+	vstmia	DST!, {X0,X1,X2,X3}	C Store first block, DST advances to the second
+	vld1.64 {X0}, [r12]		C X0 is free again, reuse it for {1,0,0,0}
+	vadd.i32	T0, T0, Y3
+	vadd.i64	T2, T2, X0	C Second block was computed from counter + 1
+	vadd.i32	T1, T1, Y0
+	vadd.i32	T2, T2, Y1
+	vadd.i32	T3, T3, Y2
+
+	vstm	DST, {T0,T1,T2,T3}	C Store second block
+	bx	lr
+EPILOGUE(_nettle_salsa20_2core)
diff --git a/configure.ac b/configure.ac
index 1c0b7393..3f6c2f3b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -455,7 +455,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
 # Assembler files which generate additional object files if they are used.
asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
-  chacha-core-internal-2.asm \
+  chacha-core-internal-2.asm salsa20-2core.asm \
   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
   sha3-permute-2.asm sha512-compress-2.asm \
   umac-nh-n-2.asm umac-nh-2.asm"
@@ -573,6 +573,7 @@ AH_VERBATIM([HAVE_NATIVE],
 #undef HAVE_NATIVE_ecc_secp521r1_redc
 #undef HAVE_NATIVE_gcm_hash8
 #undef HAVE_NATIVE_salsa20_core
+#undef HAVE_NATIVE_salsa20_2core
 #undef HAVE_NATIVE_sha1_compress
 #undef HAVE_NATIVE_sha256_compress
 #undef HAVE_NATIVE_sha512_compress
diff --git a/salsa20-crypt.c b/salsa20-crypt.c
index 770b3b4c..b25cfc3d 100644
--- a/salsa20-crypt.c
+++ b/salsa20-crypt.c
@@ -57,7 +57,30 @@ salsa20_crypt(struct salsa20_ctx *ctx,
 {
   if (!length)
     return;
-  
+
+#if HAVE_NATIVE_salsa20_2core
+  uint32_t x[2*_SALSA20_INPUT_LENGTH]; /* keystream for two consecutive blocks */
+  while (length > SALSA20_BLOCK_SIZE)
+    {
+      _salsa20_2core (x, ctx->input, 20);
+      ctx->input[8] += 2;
+      ctx->input[9] += (ctx->input[8] < 2); /* carry into the high counter word */
+      if (length <= 2 * SALSA20_BLOCK_SIZE) /* <=, so exactly two blocks do not fall through and advance the counter again */
+        {
+          memxor3 (c, m, x, length);
+          return;
+        }
+      memxor3 (c, m, x, 2*SALSA20_BLOCK_SIZE);
+
+      length -= 2*SALSA20_BLOCK_SIZE;
+      c += 2*SALSA20_BLOCK_SIZE;
+      m += 2*SALSA20_BLOCK_SIZE;
+    }
+  _salsa20_core (x, ctx->input, 20); /* here 1 <= length <= SALSA20_BLOCK_SIZE */
+  ctx->input[9] += (++ctx->input[8] == 0);
+  memxor3 (c, m, x, length);
+  return;
+#else
   for (;;)
     {
       uint32_t x[_SALSA20_INPUT_LENGTH];
@@ -79,4 +102,5 @@ salsa20_crypt(struct salsa20_ctx *ctx,
       c += SALSA20_BLOCK_SIZE;
       m += SALSA20_BLOCK_SIZE;
     }
+#endif
 }
diff --git a/salsa20-internal.h b/salsa20-internal.h
index e056b8d3..fc1bb310 100644
--- a/salsa20-internal.h
+++ b/salsa20-internal.h
@@ -38,8 +38,12 @@
 #include "nettle-types.h"
 
 #define _salsa20_core _nettle_salsa20_core
+#define _salsa20_2core _nettle_salsa20_2core
 
 void
 _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
 
+void
+_salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
 #endif /*
NETTLE_SALSA20_INTERNAL_H_INCLUDED */
diff --git a/salsa20r12-crypt.c b/salsa20r12-crypt.c
index 20aecfc0..17c53975 100644
--- a/salsa20r12-crypt.c
+++ b/salsa20r12-crypt.c
@@ -55,13 +55,35 @@ salsa20r12_crypt(struct salsa20_ctx *ctx,
 		 uint8_t *c,
 		 const uint8_t *m)
 {
-  uint32_t x[_SALSA20_INPUT_LENGTH];
-
   if (!length)
     return;
 
+#if HAVE_NATIVE_salsa20_2core
+  uint32_t x[2*_SALSA20_INPUT_LENGTH]; /* keystream for two consecutive blocks */
+  while (length > SALSA20_BLOCK_SIZE)
+    {
+      _salsa20_2core (x, ctx->input, 12);
+      ctx->input[8] += 2;
+      ctx->input[9] += (ctx->input[8] < 2); /* carry into the high counter word */
+      if (length <= 2 * SALSA20_BLOCK_SIZE) /* <=, so exactly two blocks do not fall through and advance the counter again */
+        {
+          memxor3 (c, m, x, length);
+          return;
+        }
+      memxor3 (c, m, x, 2*SALSA20_BLOCK_SIZE);
+
+      length -= 2*SALSA20_BLOCK_SIZE;
+      c += 2*SALSA20_BLOCK_SIZE;
+      m += 2*SALSA20_BLOCK_SIZE;
+    }
+  _salsa20_core (x, ctx->input, 12); /* 12 rounds, same as the 2core path; 1 <= length <= SALSA20_BLOCK_SIZE here */
+  ctx->input[9] += (++ctx->input[8] == 0);
+  memxor3 (c, m, x, length);
+  return;
+#else
   for (;;)
     {
+      uint32_t x[_SALSA20_INPUT_LENGTH];
 
       _salsa20_core (x, ctx->input, 12);
 
@@ -80,4 +102,5 @@ salsa20r12_crypt(struct salsa20_ctx *ctx,
       c += SALSA20_BLOCK_SIZE;
       m += SALSA20_BLOCK_SIZE;
     }
+#endif
 }
diff --git a/testsuite/salsa20-test.c b/testsuite/salsa20-test.c
index 3a1b8eab..d85f69ba 100644
--- a/testsuite/salsa20-test.c
+++ b/testsuite/salsa20-test.c
@@ -1,5 +1,6 @@
 #include "testutils.h"
 #include "salsa20.h"
+#include "salsa20-internal.h"
 
 #include "memxor.h"
 
@@ -118,6 +119,95 @@ test_salsa20_stream(const struct tstring *key,
     }
 }
 
+/* Test with simple structure of the salsa20 input, for debugging of
+   _salsa20_core and _salsa20_2core.
*/
+static void
+test_salsa20_core(void)
+{
+  const uint32_t input[2][16] =
+    {
+     {
+      0, 1, 2, 3,
+      4, 5, 6, 7,
+      0xffffffff, 9, 10, 11, /* words 8 and 9 are the block counter; low word is about to wrap */
+      12, 13, 14, 15
+     },
+     {
+      0, 1, 2, 3,
+      4, 5, 6, 7,
+      0, 10, 10, 11, /* input[0] with the counter incremented by one, carry included */
+      12, 13, 14, 15
+     },
+    };
+
+  const struct tstring *expected_12 /* 12-round output for input[0], then input[1] */
+    = SHEX("c456dd00835121fa 2f3f818adea91c66"
+	   "c024ec78191dbef8 4e828fde71420f4f"
+	   "2edb91cc7ae72fe6 1c6d96d1169241f5"
+	   "8d34bec538389247 1b2f71089992fd2b"
+	   "a1194b4875788ee5 731f27c32481450b"
+	   "4cc7b2a3f8ac7f43 6f42bd16a71cb721"
+	   "299f6d9481e4bc87 23b5c0a2f142e507"
+	   "34b7fe35fe292f2f 1bf9ae5296afdbeb");
+
+  const struct tstring *expected_20 /* 20-round output for input[0], then input[1] */
+    = SHEX(
+	   "02e02587e69cd380 3e5f3c53f0c29173"
+	   "d3becef2da8da494 e8d1d4294270fc5e"
+	   "a2c2001a6a45dc71 a3699e6594af795f"
+	   "299814ae4f73650b e1d13040031dbfef"
+	   "46b5b8ce5dc5b255 78b2695eb61fa816"
+	   "7e22958311e2d585 826f4ebf1c7b3c98"
+	   "a2857c3e4edc6f9e ed4312d698ddad55"
+	   "57d13942292f8713 63eb7a5ab07a707e");
+
+  ASSERT (expected_12->length == 128);
+  ASSERT (expected_20->length == 128);
+
+  { /* Single-block core, one input at a time. */
+    uint32_t output[32];
+    _salsa20_core (output, input[0], 12);
+    ASSERT (MEMEQ(64, output, expected_12->data));
+    _salsa20_core (output, input[1], 12);
+    ASSERT (MEMEQ(64, output, expected_12->data+64));
+
+    _salsa20_core (output, input[0], 20);
+    ASSERT (MEMEQ(64, output, expected_20->data));
+    _salsa20_core (output, input[1], 20);
+    ASSERT (MEMEQ(64, output, expected_20->data+64));
+  }
+
+  {
+    struct salsa20_ctx ctx;
+    uint8_t output[128];
+
+    /* Two blocks in one call; exercises _salsa20_2core, if available.  */
+    memcpy (&ctx, input[0], sizeof(ctx));
+    salsa20r12_crypt (&ctx, 128, output, expected_12->data); /* XOR with the expected keystream must give all zeros */
+
+    if (!memzero_p (output, 128))
+      {
+	fprintf(stderr, "salsa20r12_crypt failed:\n");
+	fprintf(stderr, "\nOutput: ");
+	print_hex(128, output);
+	fprintf(stderr, "\n");
+	FAIL();
+      }
+
+    memcpy (&ctx, input[0], sizeof(ctx));
+    salsa20_crypt (&ctx, 128, output, expected_20->data); /* same check for the 20-round cipher */
+
+    if (!memzero_p (output, 128))
+      {
+	fprintf(stderr, "salsa20_crypt failed:\n");
+	fprintf(stderr, "\nOutput: ");
+	print_hex(128, output);
+	fprintf(stderr, "\n");
+	FAIL();
+      }
+  }
+}
+
 typedef void salsa20_func(struct salsa20_ctx *ctx,
 			  size_t length, uint8_t *dst,
 			  const uint8_t *src);
@@ -191,6 +281,8 @@ _test_salsa20(salsa20_func *crypt,
 void
 test_main(void)
 {
+  test_salsa20_core();
+
   /* http://www.ecrypt.eu.org/stream/svn/viewcvs.cgi/ecrypt/trunk/submissions/salsa20/reduced/12-rounds/verified.test-vectors?logsort=rev&rev=210&view=markup */
   test_salsa20r12(SHEX("80000000 00000000 00000000 00000000"),
 		  SHEX("00000000 00000000"),