From f3b147e415a9bba6015d64d04b349c6b1d5f27b8 Mon Sep 17 00:00:00 2001 From: Jake Massimo Date: Thu, 14 May 2026 03:52:17 +0000 Subject: [PATCH 1/4] ML-DSA: add build support and importer for x86_64 assembly backend Add CMake support to compile mldsa-native x86_64 assembly files, a custom mldsa_x86_64_meta.h declaring only the assembly-backed native operations (NTT, INTT, nttunpack, pointwise, polyvecl_pointwise_acc), and the importer script to pull them from upstream. --- crypto/fipsmodule/CMakeLists.txt | 18 +++ crypto/fipsmodule/ml_dsa/importer.sh | 86 ++++++++++++- .../fipsmodule/ml_dsa/mldsa_native_backend.h | 18 +++ .../fipsmodule/ml_dsa/mldsa_native_config.h | 4 + crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h | 120 ++++++++++++++++++ 5 files changed, 243 insertions(+), 3 deletions(-) create mode 100644 crypto/fipsmodule/ml_dsa/mldsa_native_backend.h create mode 100644 crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 07859817d34..ea19c3fd2cf 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -397,6 +397,24 @@ if((ARCH STREQUAL "x86_64") AND UNIX AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) endif() +# mldsa-native assembly files can be compiled on Unix platforms for x86_64 only. +if((ARCH STREQUAL "x86_64") AND UNIX AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) + + set(MLDSA_NATIVE_DIR "${AWSLC_SOURCE_DIR}/crypto/fipsmodule/ml_dsa") + + # Every .S file in this directory is imported by importer.sh and must be + # compiled; glob so that refreshes which add/remove files don't need a + # matching edit here. CONFIGURE_DEPENDS makes CMake re-run when the set of + # matching files changes. 
+ file(GLOB MLDSA_NATIVE_X86_64_ASM_SOURCES CONFIGURE_DEPENDS + "${MLDSA_NATIVE_DIR}/mldsa/native/x86_64/src/*.S") + + list(APPEND BCM_ASM_SOURCES ${MLDSA_NATIVE_X86_64_ASM_SOURCES}) + + set(S2N_BIGNUM_INCLUDE_DIR "${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/include") + +endif() + if(FIPS_DELOCATE) if(FIPS_SHARED) diff --git a/crypto/fipsmodule/ml_dsa/importer.sh b/crypto/fipsmodule/ml_dsa/importer.sh index 51a37e20a65..ecb3dc9765b 100755 --- a/crypto/fipsmodule/ml_dsa/importer.sh +++ b/crypto/fipsmodule/ml_dsa/importer.sh @@ -72,22 +72,57 @@ popd echo "Pull source code from remote repository..." -# Copy mldsa-native source tree -- C source only (no native backends for now) +# Copy mldsa-native source tree -- C source mkdir $SRC -cp $TMP/mldsa/src/* $SRC +# Copy only files (not subdirectories like native/ and fips202/) +find $TMP/mldsa/src -maxdepth 1 -type f -exec cp {} $SRC \; + +# Copy x86_64 backend +# We import only the assembly-backed operations (NTT, INTT, nttunpack, +# pointwise, polyvecl_pointwise_acc). The AVX2 C-intrinsic operations +# (rej_uniform, decompose, use_hint, chknorm, caddq, polyz_unpack) are +# intentionally excluded. +# +# The upstream meta.h advertises both assembly and C-intrinsic operations. +# Rather than modify it, we keep a hand-maintained replacement in +# ../mldsa_x86_64_meta.h (referenced via MLD_CONFIG_ARITH_BACKEND_FILE) that +# declares only the assembly-backed subset. Upstream meta.h is not copied. 
+mkdir -p $SRC/native/x86_64/src +# Backend API and specification assumed by mldsa-native frontend +cp $TMP/mldsa/src/native/api.h $SRC/native +# Backend header -- unused C-intrinsic declarations are harmless and left intact +cp $TMP/mldsa/src/native/x86_64/src/arith_native_x86_64.h $SRC/native/x86_64/src +# Shared constants (zetas table); needed by the assembly kernels +cp $TMP/mldsa/src/native/x86_64/src/consts.h $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/consts.c $SRC/native/x86_64/src +# Assembly source files for the operations we import (NTT, INTT, nttunpack, +# pointwise, polyvecl_pointwise_acc). Only files with verified proofs are +# included. +cp $TMP/mldsa/src/native/x86_64/src/ntt_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/intt_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S $SRC/native/x86_64/src # We use the custom `mldsa_native_config.h`, so can remove the default one -rm $SRC/config.h +rm -f $SRC/config.h # Copy formatting file cp $TMP/.clang-format $SRC +# ================================================================ +# Process mldsa_native_bcm.c +# ================================================================ + # Copy and statically simplify BCM file # The static simplification is not necessary, but improves readability # by removing directives related to the FIPS-202 backend that we provide # via our own glue layer. 
unifdef -DMLD_CONFIG_FIPS202_CUSTOM_HEADER \ -UMLD_CONFIG_USE_NATIVE_BACKEND_FIPS202 \ + -UMLD_SYS_AARCH64 \ $TMP/mldsa/mldsa_native.c \ > $SRC/mldsa_native_bcm.c @@ -110,6 +145,51 @@ cp $TMP/mldsa/mldsa_native.h $SRC echo "Fixup include paths" sed "${SED_I[@]}" 's/#include "src\/\([^"]*\)"/#include "\1"/' $SRC/mldsa_native_bcm.c +# Drop #include directives for the C-intrinsic .c files we did not import. +# Only consts.c (shared with the assembly backend) needs to be compiled. +echo "Strip C-intrinsic includes from mldsa_native_bcm.c" +BCM=$SRC/mldsa_native_bcm.c +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_caddq_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_chknorm_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_decompose_32_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_decompose_88_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_use_hint_32_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_use_hint_88_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/polyz_unpack_17_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/polyz_unpack_19_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_eta2_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_eta4_avx2\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_table\.c"/d' "$BCM" + +# ================================================================ +# Fixup x86_64 assembly backend to use s2n-bignum macros +# ================================================================ + +echo "Fixup x86_64 assembly backend to use s2n-bignum macros" +for file in $SRC/native/x86_64/src/*.S; do + echo "Processing $file" + tmp_file=$(mktemp) + + 
backend_define="MLD_ARITH_BACKEND_X86_64_DEFAULT" + + # Flatten multiline preprocessor directives, then process with unifdef + sed -e ':a' -e 'N' -e '$!ba' -e 's/\\\n/ /g' "$file" | \ + unifdef -D$backend_define -UMLD_CONFIG_MULTILEVEL_NO_SHARED -DMLD_CONFIG_MULTILEVEL_WITH_SHARED > "$tmp_file" + mv "$tmp_file" "$file" + + # Replace common.h include and assembly macros + s2n_header="_internal_s2n_bignum_x86_att.h" + sed "${SED_I[@]}" "s/#include \"\.\.\/\.\.\/\.\.\/common\.h\"/#include \"$s2n_header\"/" "$file" + + func_name=$(grep -o '\.global MLD_ASM_NAMESPACE(\([^)]*\))' "$file" | sed 's/\.global MLD_ASM_NAMESPACE(\([^)]*\))/\1/') + if [ -n "$func_name" ]; then + sed "${SED_I[@]}" "s/\.global MLD_ASM_NAMESPACE($func_name)/ S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_$func_name)\n S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_$func_name)/" "$file" + sed "${SED_I[@]}" "s/MLD_ASM_FN_SYMBOL($func_name)/S2N_BN_SYMBOL(mldsa_$func_name):/" "$file" + sed "${SED_I[@]}" "s/MLD_ASM_FN_SIZE($func_name)/S2N_BN_SIZE_DIRECTIVE(mldsa_$func_name)/" "$file" + fi +done + echo "Remove temporary artifacts ..." rm -rf $TMP diff --git a/crypto/fipsmodule/ml_dsa/mldsa_native_backend.h b/crypto/fipsmodule/ml_dsa/mldsa_native_backend.h new file mode 100644 index 00000000000..749f3246ff6 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa_native_backend.h @@ -0,0 +1,18 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +#ifndef MLDSA_NATIVE_BACKEND_H +#define MLDSA_NATIVE_BACKEND_H + +#include <openssl/target.h> + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) + +#if defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX) +#include "mldsa_x86_64_meta.h" +#endif + +#endif + +#endif /* MLDSA_NATIVE_BACKEND_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa_native_config.h b/crypto/fipsmodule/ml_dsa/mldsa_native_config.h index b9649e1df20..2172eaffb1f 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa_native_config.h +++ b/crypto/fipsmodule/ml_dsa/mldsa_native_config.h @@ -116,4 +116,8 @@ static MLD_INLINE void *mld_memset(void *s, int c, size_t n) { #define MLD_CONFIG_NO_ASM #endif +// Enable x86_64 arithmetic backend and set path +#define MLD_CONFIG_USE_NATIVE_BACKEND_ARITH +#define MLD_CONFIG_ARITH_BACKEND_FILE "../mldsa_native_backend.h" + #endif // MLD_CONFIG_H diff --git a/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h b/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h new file mode 100644 index 00000000000..823e2850a13 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h @@ -0,0 +1,120 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +/* + * Custom x86_64 backend header for the mldsa-native import. + * + * mldsa-native's upstream meta.h declares native implementations for both + * assembly-backed operations (NTT, INTT, pointwise multiplication) and + * AVX2 C-intrinsic operations (rej_uniform, decompose, use_hint, chknorm, + * caddq, polyz_unpack). AWS-LC only imports the assembly-backed operations, + * so we replace the upstream meta.h with this trimmed-down version that + * declares only the subset we actually provide. + * + * Kept outside the imported `mldsa/` tree so that `importer.sh` does not + * need to modify upstream sources.
+ */ + +#ifndef MLD_NATIVE_X86_64_META_H +#define MLD_NATIVE_X86_64_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLD_ARITH_BACKEND_X86_64_DEFAULT + +#define MLD_USE_NATIVE_NTT_CUSTOM_ORDER +#define MLD_USE_NATIVE_NTT +#define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 +#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 + +#if !defined(__ASSEMBLER__) +#include "mldsa/native/api.h" +#include "mldsa/native/x86_64/src/arith_native_x86_64.h" + +static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t data[MLDSA_N]) +{ + if (mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + mld_nttunpack_avx2_asm(data); + } +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_ntt_native(int32_t data[MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_ntt_avx2_asm(data, mld_qdata); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_invntt_avx2_asm(data, mld_qdata); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_pointwise_montgomery_native( + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_pointwise_avx2_asm(a, b, mld_qdata); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native( + int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N], + const int32_t v[4][MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + 
mld_pointwise_acc_l4_avx2_asm(w, u, v, mld_qdata); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native( + int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N], + const int32_t v[5][MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_pointwise_acc_l5_avx2_asm(w, u, v, mld_qdata); + return MLD_NATIVE_FUNC_SUCCESS; +} + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native( + int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N], + const int32_t v[7][MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_pointwise_acc_l7_avx2_asm(w, u, v, mld_qdata); + return MLD_NATIVE_FUNC_SUCCESS; +} + +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLD_NATIVE_X86_64_META_H */ From 1c4f4d2b62fabba63d26411b8e53104fa36579a2 Mon Sep 17 00:00:00 2001 From: Jake Massimo Date: Thu, 14 May 2026 03:52:32 +0000 Subject: [PATCH 2/4] ML-DSA: import x86_64 assembly backend from mldsa-native MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Output of: GITHUB_SHA=1b47ba602b3220fb06380840fd516dde4243122e ./importer.sh --force No manual changes — reproducible by running the above command. 
--- crypto/fipsmodule/ml_dsa/META.yml | 6 +- crypto/fipsmodule/ml_dsa/mldsa/.clang-format | 2 +- crypto/fipsmodule/ml_dsa/mldsa/cbmc.h | 39 +- crypto/fipsmodule/ml_dsa/mldsa/common.h | 54 +- crypto/fipsmodule/ml_dsa/mldsa/ct.h | 160 +- crypto/fipsmodule/ml_dsa/mldsa/debug.h | 54 +- crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h | 959 +++---- .../ml_dsa/mldsa/mldsa_native_bcm.c | 180 +- crypto/fipsmodule/ml_dsa/mldsa/native/api.h | 609 +++++ .../native/x86_64/src/arith_native_x86_64.h | 200 ++ .../ml_dsa/mldsa/native/x86_64/src/consts.c | 157 ++ .../ml_dsa/mldsa/native/x86_64/src/consts.h | 27 + .../mldsa/native/x86_64/src/intt_avx2_asm.S | 2308 ++++++++++++++++ .../mldsa/native/x86_64/src/ntt_avx2_asm.S | 2380 +++++++++++++++++ .../native/x86_64/src/nttunpack_avx2_asm.S | 235 ++ .../x86_64/src/pointwise_acc_l4_avx2_asm.S | 135 + .../x86_64/src/pointwise_acc_l5_avx2_asm.S | 151 ++ .../x86_64/src/pointwise_acc_l7_avx2_asm.S | 183 ++ .../native/x86_64/src/pointwise_avx2_asm.S | 127 + crypto/fipsmodule/ml_dsa/mldsa/packing.c | 284 +- crypto/fipsmodule/ml_dsa/mldsa/packing.h | 322 +-- crypto/fipsmodule/ml_dsa/mldsa/poly.c | 148 +- crypto/fipsmodule/ml_dsa/mldsa/poly.h | 315 ++- crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c | 148 +- crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h | 298 +-- crypto/fipsmodule/ml_dsa/mldsa/polyvec.c | 506 +--- crypto/fipsmodule/ml_dsa/mldsa/polyvec.h | 645 ++--- crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.c | 308 +++ crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.h | 653 +++++ crypto/fipsmodule/ml_dsa/mldsa/randombytes.h | 3 +- crypto/fipsmodule/ml_dsa/mldsa/reduce.h | 57 +- crypto/fipsmodule/ml_dsa/mldsa/rounding.h | 122 +- crypto/fipsmodule/ml_dsa/mldsa/sign.c | 887 +++--- crypto/fipsmodule/ml_dsa/mldsa/sign.h | 877 +++--- crypto/fipsmodule/ml_dsa/mldsa/symmetric.h | 1 - crypto/fipsmodule/ml_dsa/mldsa/sys.h | 11 + crypto/fipsmodule/ml_dsa/mldsa/zetas.inc | 1 - 37 files changed, 10291 insertions(+), 3261 deletions(-) create mode 100644 
crypto/fipsmodule/ml_dsa/mldsa/native/api.h create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/arith_native_x86_64.h create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.c create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.h create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/intt_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/ntt_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/nttunpack_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l4_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l5_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l7_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_avx2_asm.S create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.c create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.h diff --git a/crypto/fipsmodule/ml_dsa/META.yml b/crypto/fipsmodule/ml_dsa/META.yml index 4d0e131c3f8..523f403f388 100644 --- a/crypto/fipsmodule/ml_dsa/META.yml +++ b/crypto/fipsmodule/ml_dsa/META.yml @@ -1,5 +1,5 @@ name: mldsa-native source: pq-code-package/mldsa-native.git -branch: main -commit: b61e84f0c73d4ed612ffcaea4282a9d682de3f46 -imported-at: 2026-01-16T13:12:01-0800 +branch: 1b47ba602b3220fb06380840fd516dde4243122e +commit: 1b47ba602b3220fb06380840fd516dde4243122e +imported-at: 2026-05-14T03:52:22+0000 diff --git a/crypto/fipsmodule/ml_dsa/mldsa/.clang-format b/crypto/fipsmodule/ml_dsa/mldsa/.clang-format index 0148b907d75..6ebaa3b5a22 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/.clang-format +++ b/crypto/fipsmodule/ml_dsa/mldsa/.clang-format @@ -26,4 +26,4 @@ Macros: - __contract__(x)={ void a; void b; void c; void d; void e; void f; } void abcdefghijklmnopqrstuvw() - __loop__(x)={} do # Make this artifically long to force 
line break - - MLK_INTERNAL_API=void abcdefghijklmnopqrstuvwabcdefghijklmnopqrstuvwabcdefg(); + - MLD_INTERNAL_API=void abcdefghijklmnopqrstuvwabcdefghijklmnopqrstuvwabcdefg(); diff --git a/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h b/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h index 8ed1ecb06ba..d1be719714f 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/cbmc.h @@ -5,6 +5,7 @@ #ifndef MLD_CBMC_H #define MLD_CBMC_H + /*************************************************** * Basic replacements for __CPROVER_XXX contracts ***************************************************/ @@ -16,11 +17,19 @@ #else /* !CBMC */ -#include #define __contract__(x) x #define __loop__(x) x +/* Conditionally expand to __VA_ARGS__ depending on MLD_CONFIG_REDUCE_RAM. */ +#if defined(MLD_CONFIG_REDUCE_RAM) +#define MLD_IF_REDUCE_RAM(...) __VA_ARGS__ +#define MLD_IF_NOT_REDUCE_RAM(...) +#else +#define MLD_IF_REDUCE_RAM(...) +#define MLD_IF_NOT_REDUCE_RAM(...) __VA_ARGS__ +#endif + /* https://diffblue.github.io/cbmc/contracts-assigns.html */ #define assigns(...) 
__CPROVER_assigns(__VA_ARGS__) @@ -97,7 +106,7 @@ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \ } -#define exists(qvar, qvar_lb, qvar_ub, predicate) \ +#define exists(qvar, qvar_lb, qvar_ub, predicate) \ __CPROVER_exists \ { \ unsigned qvar; \ @@ -121,30 +130,30 @@ { \ unsigned qvar; \ ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ - (((int)(value_lb) <= ((array_var)[(qvar)])) && \ - (((array_var)[(qvar)]) < (int)(value_ub))) \ + (((int)(value_lb) <= ((array_var)[(qvar)])) && \ + (((array_var)[(qvar)]) < (int)(value_ub))) \ } #define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \ - array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \ + array_bound_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), (qvar_lb), \ (qvar_ub), (array_var), (value_lb), (value_ub)) -#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \ - __CPROVER_forall \ - { \ - unsigned qvar; \ - ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ +#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \ + __CPROVER_forall \ + { \ + unsigned qvar; \ + ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ ((array_var)[(qvar)]) == (old(* (int32_t (*)[(qvar_ub)])(array_var)))[(qvar)] \ } #define array_unchanged(array_var, N) \ array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __COUNTER__), 0, (N), (array_var)) -#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \ - __CPROVER_forall \ - { \ - unsigned qvar; \ - ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ +#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \ + __CPROVER_forall \ + { \ + unsigned qvar; \ + ((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \ ((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \ } diff --git a/crypto/fipsmodule/ml_dsa/mldsa/common.h b/crypto/fipsmodule/ml_dsa/mldsa/common.h index 72d1e98f76e..cc86c017902 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/common.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/common.h @@ -6,6 +6,11 
@@ #ifndef MLD_COMMON_H #define MLD_COMMON_H +#ifndef __ASSEMBLER__ +#include +#endif + + #define MLD_BUILD_INTERNAL #if defined(MLD_CONFIG_FILE) @@ -21,8 +26,12 @@ * this can be overwritten by the user, e.g. for single-CU builds. */ #if !defined(MLD_CONFIG_INTERNAL_API_QUALIFIER) #define MLD_INTERNAL_API +#define MLD_INTERNAL_DATA_DECLARATION extern +#define MLD_INTERNAL_DATA_DEFINITION #else #define MLD_INTERNAL_API MLD_CONFIG_INTERNAL_API_QUALIFIER +#define MLD_INTERNAL_DATA_DECLARATION MLD_CONFIG_INTERNAL_API_QUALIFIER +#define MLD_INTERNAL_DATA_DEFINITION MLD_CONFIG_INTERNAL_API_QUALIFIER #endif #if !defined(MLD_CONFIG_EXTERNAL_API_QUALIFIER) @@ -77,8 +86,24 @@ */ #if defined(MLD_SYS_X86_64) #define MLD_ASM_FN_SYMBOL(sym) MLD_ASM_NAMESPACE(sym) : MLD_CET_ENDBR -#else +#elif defined(MLD_SYS_ARMV81M_MVE) +/* clang-format off */ +#define MLD_ASM_FN_SYMBOL(sym) \ + .type MLD_ASM_NAMESPACE(sym), %function; \ + MLD_ASM_NAMESPACE(sym) : +/* clang-format on */ +#else /* !MLD_SYS_X86_64 && MLD_SYS_ARMV81M_MVE */ #define MLD_ASM_FN_SYMBOL(sym) MLD_ASM_NAMESPACE(sym) : +#endif /* !MLD_SYS_X86_64 && !MLD_SYS_ARMV81M_MVE */ + +/* + * Output the size of an assembly function. 
+ */ +#if defined(__ELF__) +#define MLD_ASM_FN_SIZE(sym) \ + .size MLD_ASM_NAMESPACE(sym), .- MLD_ASM_NAMESPACE(sym) +#else +#define MLD_ASM_FN_SIZE(sym) #endif /* We aim to simplify the user's life by supporting builds where @@ -107,6 +132,14 @@ #error Bad configuration: MLD_CONFIG_NO_RANDOMIZED_API is incompatible with MLD_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_sign_signature() #endif +#if defined(MLD_CONFIG_NO_SIGN_API) && defined(MLD_CONFIG_KEYGEN_PCT) +#error Bad configuration: MLD_CONFIG_NO_SIGN_API is incompatible with MLD_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_sign_signature() +#endif + +#if defined(MLD_CONFIG_NO_VERIFY_API) && defined(MLD_CONFIG_KEYGEN_PCT) +#error Bad configuration: MLD_CONFIG_NO_VERIFY_API is incompatible with MLD_CONFIG_KEYGEN_PCT as the current PCT implementation requires crypto_sign_verify() +#endif + #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) #include MLD_CONFIG_ARITH_BACKEND_FILE /* Include to enforce consistency of API and implementation, @@ -269,20 +302,6 @@ #endif /* MLD_CONFIG_CUSTOM_ALLOC_FREE */ -/* - * We are facing severe CBMC performance issues when using unions. - * As a temporary workaround, we use unions only when MLD_CONFIG_REDUCE_RAM is - * set. - * TODO: Remove the workaround once - * https://github.com/diffblue/cbmc/issues/8813 - * is resolved - */ -#if defined(MLD_CONFIG_REDUCE_RAM) -#define MLK_UNION_OR_STRUCT union -#else -#define MLK_UNION_OR_STRUCT struct -#endif - /****************************** Error codes ***********************************/ /* Generic failure condition */ @@ -293,6 +312,11 @@ /* An rng failure occured. Might be due to insufficient entropy or * system misconfiguration. */ #define MLD_ERR_RNG_FAIL -3 +/* The signing rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS iterations without producing a valid + * signature. With a FIPS 204 Appendix C compliant bound (>= 814) this + * has probability < 2^-256. 
*/ +#define MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED -4 #endif /* !__ASSEMBLER__ */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/ct.h b/crypto/fipsmodule/ml_dsa/mldsa/ct.h index c307456468a..f7953b3b027 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/ct.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/ct.h @@ -26,7 +26,6 @@ #ifndef MLD_CT_H #define MLD_CT_H -#include #include "cbmc.h" #include "common.h" @@ -83,30 +82,38 @@ extern volatile uint64_t mld_ct_opt_blocker_u64; * Its validity relies on the assumption that the global opt-blocker * constant mld_ct_opt_blocker_u64 is not modified. */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint64_t mld_ct_get_optblocker_u64(void) __contract__(ensures(return_value == 0)) { return mld_ct_opt_blocker_u64; } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int64_t mld_ct_get_optblocker_i64(void) __contract__(ensures(return_value == 0)) { return (int64_t)mld_ct_get_optblocker_u64(); } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint32_t mld_ct_get_optblocker_u32(void) __contract__(ensures(return_value == 0)) { return (uint32_t)mld_ct_get_optblocker_u64(); } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint8_t mld_ct_get_optblocker_u8(void) __contract__(ensures(return_value == 0)) { return (uint8_t)mld_ct_get_optblocker_u64(); } /* Opt-blocker based implementation of value barriers */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int64_t mld_value_barrier_i64(int64_t b) __contract__(ensures(return_value == b)) { return (b ^ mld_ct_get_optblocker_i64()); } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint32_t mld_value_barrier_u32(uint32_t b) __contract__(ensures(return_value == b)) { return (b ^ mld_ct_get_optblocker_u32()); } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint8_t mld_value_barrier_u8(uint8_t b) __contract__(ensures(return_value == b)) { return (b ^ mld_ct_get_optblocker_u8()); } #else /* !MLD_USE_ASM_VALUE_BARRIER */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int64_t mld_value_barrier_i64(int64_t b) 
__contract__(ensures(return_value == b)) { @@ -114,6 +121,7 @@ __contract__(ensures(return_value == b)) return b; } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint32_t mld_value_barrier_u32(uint32_t b) __contract__(ensures(return_value == b)) { @@ -121,6 +129,7 @@ __contract__(ensures(return_value == b)) return b; } +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint8_t mld_value_barrier_u8(uint8_t b) __contract__(ensures(return_value == b)) { @@ -134,19 +143,17 @@ __contract__(ensures(return_value == b)) #pragma CPROVER check disable "conversion" #endif -/************************************************* - * Name: mld_cast_uint32_to_int32 +/** + * Cast uint32 value to int32. * - * Description: Cast uint32 value to int32 + * @param x Input value. * - * Returns: For uint32_t x, the unique y in int32_t - * so that x == y mod 2^32. - * - * Concretely: - * - x < 2^31: returns x - * - x >= 2^31: returns x - 2^31 - * - **************************************************/ + * @return For uint32_t x, the unique y in int32_t so that x == y mod 2^32. + * Concretely: + * - x < 2^31: returns x + * - x >= 2^31: returns x - 2^31 + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_ALWAYS_INLINE int32_t mld_cast_uint32_to_int32(uint32_t x) { /* @@ -164,45 +171,43 @@ static MLD_ALWAYS_INLINE int32_t mld_cast_uint32_to_int32(uint32_t x) #endif -/************************************************* - * Name: mld_cast_int64_to_uint32 +/** + * Cast int64 value to uint32 as per C standard. * - * Description: Cast int64 value to uint32 as per C standard. + * @param x Input value. * - * Returns: For int64_t x, the unique y in uint32_t - * so that x == y mod 2^32. - **************************************************/ + * @return For int64_t x, the unique y in uint32_t so that x == y mod 2^32. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_ALWAYS_INLINE uint32_t mld_cast_int64_to_uint32(int64_t x) { return (uint32_t)(x & (int64_t)UINT32_MAX); } -/************************************************* - * Name: mld_cast_int32_to_uint32 +/** + * Cast int32 value to uint32 as per C standard. * - * Description: Cast int32 value to uint32 as per C standard. + * @param x Input value. * - * Returns: For int32_t x, the unique y in uint32_t - * so that x == y mod 2^32. - **************************************************/ + * @return For int32_t x, the unique y in uint32_t so that x == y mod 2^32. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_ALWAYS_INLINE uint32_t mld_cast_int32_to_uint32(int32_t x) { return mld_cast_int64_to_uint32((int64_t)x); } -/************************************************* - * Name: mld_ct_sel_int32 - * - * Description: Functionally equivalent to cond ? a : b, - * but implemented with guards against - * compiler-introduced branches. +/** + * Functionally equivalent to cond ? a : b, but implemented with guards against + * compiler-introduced branches. * - * Arguments: int32_t a: First alternative - * int32_t b: Second alternative - * uint32_t cond: Condition variable. + * @param a First alternative. + * @param b Second alternative. + * @param cond Condition variable. * - * - **************************************************/ + * @return a if cond is 0xFFFFFFFF, b if cond is 0. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int32_t mld_ct_sel_int32(int32_t a, int32_t b, uint32_t cond) __contract__( requires(cond == 0x0 || cond == 0xFFFFFFFF) @@ -215,14 +220,12 @@ __contract__( return mld_cast_uint32_to_int32(res); } -/************************************************* - * Name: mld_ct_cmask_nonzero_u32 - * - * Description: Return 0 if input is zero, and -1 otherwise. +/** + * Return 0 if input is zero, and -1 otherwise. 
* - * Arguments: uint32_t x: Value to be converted into a mask - * - **************************************************/ + * @param x Value to be converted into a mask. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint32_t mld_ct_cmask_nonzero_u32(uint32_t x) __contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFFFFFF))) { @@ -231,14 +234,12 @@ __contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFFFFFF))) return mld_cast_int64_to_uint32(tmp); } -/************************************************* - * Name: mld_ct_cmask_nonzero_u8 - * - * Description: Return 0 if input is zero, and -1 otherwise. +/** + * Return 0 if input is zero, and -1 otherwise. * - * Arguments: uint8_t x: Value to be converted into a mask - * - **************************************************/ + * @param x Value to be converted into a mask. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint8_t mld_ct_cmask_nonzero_u8(uint8_t x) __contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF))) { @@ -246,14 +247,12 @@ __contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF))) return (uint8_t)(mask & 0xFF); } -/************************************************* - * Name: mld_ct_cmask_neg_i32 - * - * Description: Return 0 if input is non-negative, and -1 otherwise. +/** + * Return 0 if input is non-negative, and -1 otherwise. * - * Arguments: int32_t x: Value to be converted into a mask - * - **************************************************/ + * @param x Value to be converted into a mask. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint32_t mld_ct_cmask_neg_i32(int32_t x) __contract__( ensures(return_value == ((x < 0) ? 0xFFFFFFFF : 0)) @@ -264,14 +263,12 @@ __contract__( return mld_cast_int64_to_uint32(tmp); } -/************************************************* - * Name: mld_ct_abs_i32 - * - * Description: Return -x if x<0, x otherwise +/** + * Return -x if x<0, x otherwise. 
* - * Arguments: int32_t x: Input value - * - **************************************************/ + * @param x Input value. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int32_t mld_ct_abs_i32(int32_t x) __contract__( requires(x >= -INT32_MAX) @@ -281,19 +278,17 @@ __contract__( return mld_ct_sel_int32(-x, x, mld_ct_cmask_neg_i32(x)); } -/************************************************* - * Name: mld_ct_memcmp - * - * Description: Compare two arrays for equality in constant time. +/** + * Compare two arrays for equality in constant time. * - * Arguments: const uint8_t *a: pointer to first byte array - * const uint8_t *b: pointer to second byte array - * size_t len: length of the byte arrays, upper-bounded - * to UINT16_MAX to control proof complexity - * only. + * @param[in] a Pointer to first byte array. + * @param[in] b Pointer to second byte array. + * @param len Length of the byte arrays, upper-bounded to UINT16_MAX to + * control proof complexity only. * - * Returns 0 if the byte arrays are equal, 0xFF otherwise. - **************************************************/ + * @return 0 if the byte arrays are equal, 0xFF otherwise. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE uint8_t mld_ct_memcmp(const uint8_t *a, const uint8_t *b, const size_t len) __contract__( @@ -309,7 +304,8 @@ __contract__( for (i = 0; i < len; i++) __loop__( invariant(i <= len) - invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k]))))) + invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))) + decreases(len - i)) { r |= a[i] ^ b[i]; /* s is useless, but prevents the loop from being aborted once r=0xff. */ @@ -326,16 +322,14 @@ __contract__( return (mld_value_barrier_u8(mld_ct_cmask_nonzero_u8(r) ^ s) ^ s); } -/************************************************* - * Name: mld_zeroize +/** + * Force-zeroize a buffer. * - * Description: Force-zeroize a buffer. - * @[FIPS204, Section 3.6.3] Destruction of intermediate - * values. 
+ * @[FIPS204, Section 3.6.3] Destruction of intermediate values. * - * Arguments: void *ptr: pointer to buffer to be zeroed - * size_t len: Amount of bytes to be zeroed - **************************************************/ + * @param[out] ptr Pointer to buffer to be zeroed. + * @param len Amount of bytes to be zeroed. + */ #if !defined(MLD_CONFIG_CUSTOM_ZEROIZE) #if defined(MLD_SYS_WINDOWS) #include diff --git a/crypto/fipsmodule/ml_dsa/mldsa/debug.h b/crypto/fipsmodule/ml_dsa/mldsa/debug.h index af187bb9de6..b6b468913af 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/debug.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/debug.h @@ -8,39 +8,31 @@ #include "common.h" #if defined(MLDSA_DEBUG) -#include -/************************************************* - * Name: mld_assert +/** + * Check debug assertion. * - * Description: Check debug assertion + * Prints an error message to stderr and calls exit(1) if not. * - * Prints an error message to stderr and calls - * exit(1) if not. - * - * Arguments: - file: filename - * - line: line number - * - val: Value asserted to be non-zero - **************************************************/ + * @param file Filename. + * @param line Line number. + * @param val Value asserted to be non-zero. + */ #define mld_debug_check_assert MLD_NAMESPACE(mldsa_debug_assert) void mld_debug_check_assert(const char *file, int line, const int val); -/************************************************* - * Name: mld_debug_check_bounds +/** + * Check whether values in an array of int32_t are within specified bounds. * - * Description: Check whether values in an array of int32_t - * are within specified bounds. + * Prints an error message to stderr and calls exit(1) if not. * - * Prints an error message to stderr and calls - * exit(1) if not. 
- * - * Arguments: - file: filename - * - line: line number - * - ptr: Base of array to be checked - * - len: Number of int32_t in ptr - * - lower_bound_exclusive: Exclusive lower bound - * - upper_bound_exclusive: Exclusive upper bound - **************************************************/ + * @param file Filename. + * @param line Line number. + * @param[in] ptr Base of array to be checked. + * @param len Number of int32_t in ptr. + * @param lower_bound_exclusive Exclusive lower bound. + * @param upper_bound_exclusive Exclusive upper bound. + */ #define mld_debug_check_bounds MLD_NAMESPACE(mldsa_debug_check_bounds) void mld_debug_check_bounds(const char *file, int line, const int32_t *ptr, unsigned len, int64_t lower_bound_exclusive, @@ -91,14 +83,14 @@ void mld_debug_check_bounds(const char *file, int line, const int32_t *ptr, /* Because of https://github.com/diffblue/cbmc/issues/8570, we can't * just use a single flattened array_bound(...) here. */ -#define mld_assert_bound_2d(ptr, M, N, value_lb, value_ub) \ - cassert(forall(kN, 0, (M), \ - array_bound(&((int32_t(*)[(N)])(ptr))[kN][0], 0, (N), \ +#define mld_assert_bound_2d(ptr, M, N, value_lb, value_ub) \ + cassert(forall(kN, 0, (M), \ + array_bound(&((int32_t (*)[(N)])(ptr))[kN][0], 0, (N), \ (value_lb), (value_ub)))) -#define mld_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \ - cassert(forall(kN, 0, (M), \ - array_abs_bound(&((int32_t(*)[(N)])(ptr))[kN][0], 0, (N), \ +#define mld_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \ + cassert(forall(kN, 0, (M), \ + array_abs_bound(&((int32_t (*)[(N)])(ptr))[kN][0], 0, (N), \ (value_abs_bd)))) #else /* !MLDSA_DEBUG && CBMC */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h index fd150a9df1e..df93f2910e7 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native.h @@ -132,6 +132,11 @@ /* An rng failure occured. 
Might be due to insufficient entropy or * system misconfiguration. */ #define MLD_ERR_RNG_FAIL -3 +/* The signing rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS iterations without producing a valid + * signature. With a FIPS 204 Appendix C compliant bound (>= 814) this + * has probability < 2^-256. */ +#define MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED -4 /****************************** Function API **********************************/ @@ -205,32 +210,38 @@ extern "C" { #endif -/************************************************* - * Name: crypto_sign_keypair_internal - * - * Description: Generates public and private key. Internal API. - * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise - * Consistency Test (PCT) as required by FIPS 140-3 IG. - * - * Arguments: - * - uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output public key - * - uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output private key - * - const uint8_t seed[MLDSA_SEEDBYTES]: - * input random seed - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure, incl. PCT failure - * if MLD_CONFIG_KEYGEN_PCT is enabled. - * - * Specification: Implements @[FIPS204 Algorithm 6 (ML-DSA.KeyGen_internal)] - * - **************************************************/ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +/** + * Generate a public-private key pair from a seed. + * + * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise Consistency Test + * (PCT) as required by FIPS 140-3 IG. + * + * @warning The seed must be generated by a cryptographically secure random + * number generator. + * + * @spec{Implements @[FIPS204 Algorithm 6 (ML-DSA.KeyGen_internal)].} + * + * @param[out] pk Output public key. + * @param[out] sk Output private key. 
+ * @param[in] seed Input random seed. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The PCT's signing step exhausted + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. Only possible when + * MLD_CONFIG_KEYGEN_PCT is enabled. + * @retval MLD_ERR_FAIL Other kinds of failure, including + * PCT failure if + * MLD_CONFIG_KEYGEN_PCT is enabled. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(keypair_internal)( @@ -243,29 +254,33 @@ int MLD_API_NAMESPACE(keypair_internal)( #endif ); -/************************************************* - * Name: crypto_sign_keypair - * - * Description: Generates public and private key. - * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise - * Consistency Test (PCT) as required by FIPS 140-3 IG. - * - * Arguments: - * - uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output public key - * - uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output private key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: If MLD_CONFIG_KEYGEN_PCT is enabled and the - * PCT check failed. - * - * Specification: Implements @[FIPS204 Algorithm 1 (ML-DSA.KeyGen)] - * - **************************************************/ +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Generate a public-private key pair. + * + * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise Consistency Test + * (PCT) as required by FIPS 140-3 IG. 
+ * + * @spec{Implements @[FIPS204 Algorithm 1 (ML-DSA.KeyGen)].} + * + * @param[out] pk Output public key. + * @param[out] sk Output private key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The PCT's signing step exhausted + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. Only possible when + * MLD_CONFIG_KEYGEN_PCT is enabled. + * @retval MLD_ERR_FAIL MLD_CONFIG_KEYGEN_PCT is enabled and + * the PCT check failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(keypair)( @@ -276,39 +291,48 @@ int MLD_API_NAMESPACE(keypair)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); - -/************************************************* - * Name: crypto_sign_signature_internal - * - * Description: Computes signature. Internal API. - * - * Arguments: - * - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t *m: pointer to message to be signed - * - size_t mlen: length of message - * - const uint8_t *pre: pointer to prefix string - * - size_t prelen: length of prefix string - * - const uint8_t rnd[MLDSA_RNDBYTES]: - * random seed - * - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed secret key - * - int externalmu: indicates input message m is processed as mu - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. 
- * - MLD_ERR_FAIL: Other kinds of failure - * - * If the returned value is non-zero, then the values of *sig and - * *siglen should not be referenced. - * - * Reference: This code differs from the reference implementation - * in that it adds an explicit check for nonce exhaustion - * and can return -1 in that case. - **************************************************/ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * Compute signature using a caller-supplied random seed and prefix. + * + * If the returned value is non-zero, then the values of *sig and *siglen + * should not be referenced. + * + * @spec{Implements @[FIPS204 Algorithm 7 (ML-DSA.Sign_internal)].} + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] m Pointer to message to be signed (when + * externalmu == 0), or to a precomputed + * message representative mu (when externalmu != 0). + * @param mlen Length of m. Must equal MLDSA_CRHBYTES when + * externalmu != 0. + * @param[in] pre Pointer to prefix string. Ignored when + * externalmu != 0. + * @param prelen Length of prefix string. Ignored when + * externalmu != 0. + * @param[in] rnd Random seed. + * @param[in] sk Bit-packed secret key. + * @param externalmu 0: m/mlen is the raw message; mu = H(tr, pre, m) is + * computed internally. + * non-zero: m points to a precomputed mu of + * MLDSA_CRHBYTES bytes; pre/prelen unused. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. 
+ * @retval MLD_ERR_FAIL Other kinds of failure. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(signature_internal)( @@ -323,36 +347,35 @@ int MLD_API_NAMESPACE(signature_internal)( #endif ); -/************************************************* - * Name: crypto_sign_signature - * - * Description: Computes signature. This function implements the randomized - * variant of ML-DSA. If you require the deterministic variant, - * use crypto_sign_signature_internal directly. - * - * Arguments: - * - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t *m: pointer to message to be signed - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string. - * May be NULL if ctxlen == 0. - * - size_t ctxlen: length of context string. - * Should be <= 255. - * - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed secret key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure. - * - * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign)] - * - **************************************************/ +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Compute signature. This function implements the randomized variant of + * ML-DSA. If you require the deterministic variant, use + * crypto_sign_signature_internal directly. + * + * @spec{Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign)].} + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] m Pointer to message to be signed. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. May be NULL if ctxlen == 0. + * @param ctxlen Length of context string. 
Should be <= 255. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(signature)( @@ -365,31 +388,33 @@ int MLD_API_NAMESPACE(signature)( #endif ); -/************************************************* - * Name: crypto_sign_signature_extmu - * - * Description: Computes signature. - * - * Arguments: - * - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t mu[MLDSA_CRHBYTES]: - * input mu to be signed - * - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed secret key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure. - * - * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign external mu - * variant)] - * - **************************************************/ +/** + * Compute signature in "external mu" mode: the caller has already computed + * the message representative mu = SHAKE256(tr || M', 64), where + * tr = SHAKE256(pk, 64) and M' is the FIPS 204 formatted message (e.g. + * 0x00 || ctxlen || ctx || msg for pure ML-DSA). 
This is useful when the + * message is large or streamed and cannot be held in memory. + * + * @spec{Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign external mu variant)].} + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] mu Precomputed message representative. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(signature_extmu)( @@ -402,31 +427,32 @@ int MLD_API_NAMESPACE(signature_extmu)( #endif ); -/************************************************* - * Name: crypto_sign - * - * Description: Computes signature. This function implements the randomized - * variant of ML-DSA. If you require the deterministic variant, - * use crypto_sign_signature_internal directly. 
- * - * Arguments: - * - uint8_t *sm: pointer to output signed message (allocated array - * with MLDSA{44,65,87}_BYTES + mlen bytes), can be - * equal to m - * - size_t *smlen: pointer to output length of signed message - * - const uint8_t *m: pointer to message to be signed - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed secret key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure - **************************************************/ +/** + * Compute signed message. This function implements the randomized variant of + * ML-DSA. If you require the deterministic variant, use + * crypto_sign_signature_internal directly. + * + * @param[out] sm Pointer to output signed message (allocated array with + * MLDSA{44,65,87}_BYTES + mlen bytes); can be equal to m. + * @param[out] smlen Pointer to output length of signed message. + * @param[in] m Pointer to message to be signed. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. 
+ */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(sign)( @@ -438,32 +464,38 @@ int MLD_API_NAMESPACE(sign)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); - -/************************************************* - * Name: crypto_sign_verify_internal - * - * Description: Verifies signature. Internal API. - * - * Arguments: - * - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *m: pointer to message - * - size_t mlen: length of message - * - const uint8_t *pre: pointer to prefix string - * - size_t prelen: length of prefix string - * - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed public key - * - int externalmu: indicates input message m is processed as mu - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - * Specification: Implements @[FIPS204 Algorithm 8 (ML-DSA.Verify_internal)] - * - **************************************************/ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_SIGN_API */ + +#if !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * Verify signature. Internal API. + * + * @spec{Implements @[FIPS204 Algorithm 8 (ML-DSA.Verify_internal)].} + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] m Pointer to message (when externalmu == 0), or to a + * precomputed message representative mu (when + * externalmu != 0). + * @param mlen Length of m. Must equal MLDSA_CRHBYTES when + * externalmu != 0. + * @param[in] pre Pointer to prefix string. Ignored when externalmu != 0. + * @param prelen Length of prefix string. Ignored when externalmu != 0. + * @param[in] pk Bit-packed public key. + * @param externalmu 0: m/mlen is the raw message; mu = H(H(pk), pre, m) is + * computed internally. 
+ * non-zero: m points to a precomputed mu of + * MLDSA_CRHBYTES bytes; pre/prelen unused. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(verify_internal)( @@ -477,31 +509,28 @@ int MLD_API_NAMESPACE(verify_internal)( #endif ); -/************************************************* - * Name: crypto_sign_verify - * - * Description: Verifies signature. - * - * Arguments: - * - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *m: pointer to message - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string. - * May be NULL if ctxlen == 0. - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed public key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify)] - * - **************************************************/ +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Verify signature. + * + * @spec{Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify)].} + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] m Pointer to message. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. May be NULL if ctxlen == 0. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. 
+ * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(verify)( @@ -514,29 +543,28 @@ int MLD_API_NAMESPACE(verify)( #endif ); -/************************************************* - * Name: crypto_sign_verify_extmu - * - * Description: Verifies signature. - * - * Arguments: - * - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t mu[MLDSA_CRHBYTES]: - * input mu - * - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed public key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify external mu - * variant)] - * - **************************************************/ +/** + * Verify signature in "external mu" mode: the caller has already computed + * the message representative mu = SHAKE256(tr || M', 64), where + * tr = SHAKE256(pk, 64) and M' is the FIPS 204 formatted message (e.g. + * 0x00 || ctxlen || ctx || msg for pure ML-DSA). The same mu must have + * been used at signing time. + * + * @spec{Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify external mu variant)].} + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] mu Precomputed message representative. + * @param[in] pk Bit-packed public key. + * @param context Application context. 
Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(verify_extmu)( @@ -548,28 +576,26 @@ int MLD_API_NAMESPACE(verify_extmu)( #endif ); -/************************************************* - * Name: crypto_sign_open - * - * Description: Verify signed message. - * - * Arguments: - * - uint8_t *m: pointer to output message (allocated array with - * smlen bytes), can be equal to sm - * - size_t *mlen: pointer to output length of message - * - const uint8_t *sm: pointer to signed message - * - size_t smlen: length of signed message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed public key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - **************************************************/ +/** + * Verify signed message. + * + * @param[out] m Pointer to output message (allocated array with smlen + * bytes); can be equal to sm. + * @param[out] mlen Pointer to output length of message. + * @param[in] sm Pointer to signed message. + * @param smlen Length of signed message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. 
+ * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(open)( @@ -581,10 +607,10 @@ int MLD_API_NAMESPACE(open)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_VERIFY_API */ -/************************************************* - * Hash algorithm constants for domain separation - **************************************************/ +/* Hash algorithm constants for domain separation */ #define MLD_PREHASH_NONE 0 #define MLD_PREHASH_SHA2_224 1 #define MLD_PREHASH_SHA2_256 2 @@ -599,41 +625,43 @@ int MLD_API_NAMESPACE(open)( #define MLD_PREHASH_SHAKE_128 11 #define MLD_PREHASH_SHAKE_256 12 -/************************************************* - * Name: crypto_sign_signature_pre_hash_internal - * - * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign. - * Computes signature with pre-hashed message. - * - * Arguments: - * - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t *ph: pointer to pre-hashed message - * - size_t phlen: length of pre-hashed message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t rnd[MLDSA_RNDBYTES]: - * random seed - * - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed secret key - * - int hashalg: hash algorithm constant (one of MLD_PREHASH_*) - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure +#if !defined(MLD_CONFIG_CORE_API_ONLY) +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * FIPS 204: Algorithm 4 HashML-DSA.Sign. 
Compute signature with pre-hashed + * message. * * Supported hash algorithm constants: * MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384, * MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256, * MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384, - * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256 + * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256. * - * Warning: This is an unstable API that may change in the future. If you need + * @warning This is an unstable API that may change in the future. If you need * a stable API use crypto_sign_signature_pre_hash_shake256. - **************************************************/ + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] ph Pointer to pre-hashed message. + * @param phlen Length of pre-hashed message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] rnd Random seed. + * @param[in] sk Bit-packed secret key. + * @param hashalg Hash algorithm constant (one of MLD_PREHASH_*). + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. 
+ */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(signature_pre_hash_internal)( @@ -647,38 +675,39 @@ int MLD_API_NAMESPACE(signature_pre_hash_internal)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -/************************************************* - * Name: crypto_sign_verify_pre_hash_internal - * - * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify. - * Verifies signature with pre-hashed message. - * - * Arguments: - * - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *ph: pointer to pre-hashed message - * - size_t phlen: length of pre-hashed message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed public key - * - int hashalg: hash algorithm constant (one of MLD_PREHASH_*) - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed +#if !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * FIPS 204: Algorithm 5 HashML-DSA.Verify. Verifies signature with pre-hashed + * message. * * Supported hash algorithm constants: * MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384, * MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256, * MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384, - * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256 + * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256. * - * Warning: This is an unstable API that may change in the future. If you need + * @warning This is an unstable API that may change in the future. If you need * a stable API use crypto_sign_verify_pre_hash_shake256. 
- **************************************************/ + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] ph Pointer to pre-hashed message. + * @param phlen Length of pre-hashed message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param hashalg Hash algorithm constant (one of MLD_PREHASH_*). + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(verify_pre_hash_internal)( @@ -691,34 +720,36 @@ int MLD_API_NAMESPACE(verify_pre_hash_internal)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); - -/************************************************* - * Name: crypto_sign_signature_pre_hash_shake256 - * - * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign with SHAKE256. - * Computes signature with pre-hashed message using SHAKE256. - * This function computes the SHAKE256 hash of the message - * internally. 
- * - * Arguments: - * - uint8_t sig[MLDSA_BYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t *m: pointer to message to be hashed and signed - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t rnd[MLDSA_RNDBYTES]: - * random seed - * - const uint8_t sk[MLDSA_SECRETKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed secret key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure - **************************************************/ +#endif /* !MLD_CONFIG_NO_VERIFY_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * FIPS 204: Algorithm 4 HashML-DSA.Sign with SHAKE256. + * + * Compute signature with pre-hashed message using SHAKE256. This function + * computes the SHAKE256 hash of the message internally. + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] m Pointer to message to be hashed and signed. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] rnd Random seed. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. 
+ */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(signature_pre_hash_shake256)( @@ -731,31 +762,31 @@ int MLD_API_NAMESPACE(signature_pre_hash_shake256)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); - -/************************************************* - * Name: crypto_sign_verify_pre_hash_shake256 - * - * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify with SHAKE256. - * Verifies signature with pre-hashed message using SHAKE256. - * This function computes the SHAKE256 hash of the message - *internally. - * - * Arguments: - * - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *m: pointer to message to be hashed and verified - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_PUBLICKEYBYTES(MLD_CONFIG_API_PARAMETER_SET)]: - * bit-packed public key - * - * Returns: - * - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - **************************************************/ +#endif /* !MLD_CONFIG_NO_SIGN_API */ + +#if !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * FIPS 204: Algorithm 5 HashML-DSA.Verify with SHAKE256. + * + * Verify signature with pre-hashed message using SHAKE256. This function + * computes the SHAKE256 hash of the message internally. + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] m Pointer to message to be hashed and verified. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. 
+ * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(verify_pre_hash_shake256)( @@ -767,79 +798,76 @@ int MLD_API_NAMESPACE(verify_pre_hash_shake256)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ /* Maximum formatted domain separation message length */ #define MLD_DOMAIN_SEPARATION_MAX_BYTES (2 + 255 + 11 + 64) -/************************************************* - * Name: mld_prepare_domain_separation_prefix - * - * Description: Prepares domain separation prefix for ML-DSA signing. - * For pure ML-DSA (hashalg == MLD_PREHASH_NONE): - * Format: 0x00 || ctxlen (1 byte) || ctx - * For HashML-DSA (hashalg != MLD_PREHASH_NONE): - * Format: 0x01 || ctxlen (1 byte) || ctx || oid (11 bytes) || ph - * - * Arguments: - uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES]: - * output domain separation prefix buffer - * - const uint8_t *ph: pointer to pre-hashed message - * (ignored for pure ML-DSA) - * - size_t phlen: length of pre-hashed message - * (ignored for pure ML-DSA) - * - const uint8_t *ctx: pointer to context string (may be NULL) - * - size_t ctxlen: length of context string - * - int hashalg: hash algorithm constant - * (MLD_PREHASH_NONE for pure ML-DSA, or MLD_PREHASH_* for - * HashML-DSA) - * - * Returns the total length of the formatted prefix, or 0 on error. +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Prepare domain separation prefix for ML-DSA signing. + * + * For pure ML-DSA (hashalg == MLD_PREHASH_NONE): + * Format: 0x00 || ctxlen (1 byte) || ctx. + * + * For HashML-DSA (hashalg != MLD_PREHASH_NONE): + * Format: 0x01 || ctxlen (1 byte) || ctx || oid (11 bytes) || ph. * * This function is useful for building incremental signing APIs. 
* - * Specification: - * - For HashML-DSA (hashalg != MLD_PREHASH_NONE), implements - * @[FIPS204, Algorithm 4, L23] - * - For Pure ML-DSA (hashalg == MLD_PREHASH_NONE), implements - * ``` - * M' <- BytesToBits(IntegerToBytes(0, 1) - * || IntegerToBytes(|ctx|, 1) - * || ctx - * ``` - * which is part of @[FIPS204, Algorithm 2 (ML-DSA.Sign), L10] and - * @[FIPS204, Algorithm 3 (ML-DSA.Verify), L5]. - * - **************************************************/ + * @spec{For HashML-DSA (hashalg != MLD_PREHASH_NONE), implements + * @[FIPS204, Algorithm 4, L23]. For Pure ML-DSA (hashalg == MLD_PREHASH_NONE), + * implements + * ``` + * M' <- BytesToBits(IntegerToBytes(0, 1) + * || IntegerToBytes(|ctx|, 1) + * || ctx + * ``` + * which is part of @[FIPS204, Algorithm 2 (ML-DSA.Sign), L10] and + * @[FIPS204, Algorithm 3 (ML-DSA.Verify), L5].} + * + * @param[out] prefix Output domain separation prefix buffer. + * @param[in] ph Pointer to pre-hashed message (ignored for pure + * ML-DSA). + * @param phlen Length of pre-hashed message (ignored for pure ML-DSA). + * @param[in] ctx Pointer to context string (may be NULL). + * @param ctxlen Length of context string. + * @param hashalg Hash algorithm constant (MLD_PREHASH_NONE for pure + * ML-DSA, or MLD_PREHASH_* for HashML-DSA). + * + * @return The total length of the formatted prefix, or 0 on error. + */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE size_t MLD_API_NAMESPACE(prepare_domain_separation_prefix)( uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES], const uint8_t *ph, size_t phlen, const uint8_t *ctx, size_t ctxlen, int hashalg); -/************************************************* - * Name: crypto_sign_pk_from_sk - * - * Description: Performs basic validity checks on secret key, and derives - * public key. - * - * Referring to the decoding of the secret key - * `sk=(rho, K, tr, s1, s2, t0)` - * (cf. 
[@FIPS204, Algorithm 25 skDecode]), - * the following checks are performed: - * - Check that s1 and s2 have coefficients in - * [-MLDSA_ETA, MLDSA_ETA] - * - Check that t0 and tr stored in sk match recomputed values. - * - * Arguments: - uint8_t pk[CRYPTO_PUBLICKEYBYTES]: output public key - * - const uint8_t sk[CRYPTO_SECRETKEYBYTES]: input secret key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Secret key validation failed - * - * Note: This function leaks whether the secret key is valid or invalid - * through its return value and timing. - **************************************************/ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +/** + * Perform basic validity checks on secret key, and derive public key. + * + * Referring to the decoding of the secret key `sk=(rho, K, tr, s1, s2, t0)` + * (cf. @[FIPS204, Algorithm 25 skDecode]), the following checks are + * performed: + * - Check that s1 and s2 have coefficients in [-MLDSA_ETA, MLDSA_ETA]. + * - Check that t0 and tr stored in sk match recomputed values. + * + * @note This function leaks whether the secret key is valid or invalid + * through its return value and timing. + * + * @param[out] pk Output public key. + * @param[in] sk Input secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Secret key validation failed. 
+ */ MLD_API_QUALIFIER MLD_API_MUST_CHECK_RETURN_VALUE int MLD_API_NAMESPACE(pk_from_sk)( @@ -850,6 +878,8 @@ int MLD_API_NAMESPACE(pk_from_sk)( MLD_CONFIG_CONTEXT_PARAMETER_TYPE context #endif ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ #ifdef __cplusplus } @@ -912,31 +942,37 @@ int MLD_API_NAMESPACE(pk_from_sk)( */ /* check-magic: off */ #if defined(MLD_API_LEGACY_CONFIG) || !defined(MLD_CONFIG_REDUCE_RAM) -#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 45248 -#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 56640 -#define MLD_TOTAL_ALLOC_44_SIGN 52896 -#define MLD_TOTAL_ALLOC_44_VERIFY 38816 -#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 71872 -#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 85856 -#define MLD_TOTAL_ALLOC_65_SIGN 80576 -#define MLD_TOTAL_ALLOC_65_VERIFY 62432 -#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 112832 -#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 130816 -#define MLD_TOTAL_ALLOC_87_SIGN 123584 -#define MLD_TOTAL_ALLOC_87_VERIFY 99552 +#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 26912 +#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 48480 +#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 28480 +#define MLD_TOTAL_ALLOC_44_SIGN 44704 +#define MLD_TOTAL_ALLOC_44_VERIFY 24448 +#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 44320 +#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 74624 +#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 46720 +#define MLD_TOTAL_ALLOC_65_SIGN 69312 +#define MLD_TOTAL_ALLOC_65_VERIFY 39872 +#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 75040 +#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 115488 +#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 78272 +#define MLD_TOTAL_ALLOC_87_SIGN 108224 +#define MLD_TOTAL_ALLOC_87_VERIFY 68800 #else /* MLD_API_LEGACY_CONFIG || !MLD_CONFIG_REDUCE_RAM */ -#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 32992 -#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 36192 -#define MLD_TOTAL_ALLOC_44_SIGN 32448 -#define MLD_TOTAL_ALLOC_44_VERIFY 22464 -#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 46304 -#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 50048 -#define 
MLD_TOTAL_ALLOC_65_SIGN 44768 -#define MLD_TOTAL_ALLOC_65_VERIFY 30720 -#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 62688 -#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 66336 -#define MLD_TOTAL_ALLOC_87_SIGN 59104 -#define MLD_TOTAL_ALLOC_87_VERIFY 41216 +#define MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT 11584 +#define MLD_TOTAL_ALLOC_44_KEYPAIR_PCT 16896 +#define MLD_TOTAL_ALLOC_44_PK_FROM_SK 13152 +#define MLD_TOTAL_ALLOC_44_SIGN 13120 +#define MLD_TOTAL_ALLOC_44_VERIFY 9120 +#define MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT 14656 +#define MLD_TOTAL_ALLOC_65_KEYPAIR_PCT 22560 +#define MLD_TOTAL_ALLOC_65_PK_FROM_SK 17056 +#define MLD_TOTAL_ALLOC_65_SIGN 17248 +#define MLD_TOTAL_ALLOC_65_VERIFY 10208 +#define MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT 18752 +#define MLD_TOTAL_ALLOC_87_KEYPAIR_PCT 28608 +#define MLD_TOTAL_ALLOC_87_PK_FROM_SK 21984 +#define MLD_TOTAL_ALLOC_87_SIGN 21344 +#define MLD_TOTAL_ALLOC_87_VERIFY 12512 #endif /* !(MLD_API_LEGACY_CONFIG || !MLD_CONFIG_REDUCE_RAM) */ /* check-magic: on */ @@ -957,19 +993,20 @@ int MLD_API_NAMESPACE(pk_from_sk)( #define MLD_MAX3_(a, b, c) \ ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? (b) : (c))) +#define MLD_MAX4_(a, b, c, d) MLD_MAX3_((a), (b), MLD_MAX3_((c), (d), (d))) /* - * `MLD_TOTAL_ALLOC_{44,65,87}` is the maximum across all operations for each - * parameter set. + * `MLD_TOTAL_ALLOC_{44,65,87}` is the maximum across standard API operations + * (keygen, sign, verify) for each parameter set. 
*/ -#define MLD_TOTAL_ALLOC_44 \ - MLD_MAX3_(MLD_TOTAL_ALLOC_44_KEYPAIR, MLD_TOTAL_ALLOC_44_SIGN, \ - MLD_TOTAL_ALLOC_44_VERIFY) -#define MLD_TOTAL_ALLOC_65 \ - MLD_MAX3_(MLD_TOTAL_ALLOC_65_KEYPAIR, MLD_TOTAL_ALLOC_65_SIGN, \ - MLD_TOTAL_ALLOC_65_VERIFY) -#define MLD_TOTAL_ALLOC_87 \ - MLD_MAX3_(MLD_TOTAL_ALLOC_87_KEYPAIR, MLD_TOTAL_ALLOC_87_SIGN, \ - MLD_TOTAL_ALLOC_87_VERIFY) +#define MLD_TOTAL_ALLOC_44 \ + MLD_MAX4_(MLD_TOTAL_ALLOC_44_KEYPAIR, MLD_TOTAL_ALLOC_44_PK_FROM_SK, \ + MLD_TOTAL_ALLOC_44_SIGN, MLD_TOTAL_ALLOC_44_VERIFY) +#define MLD_TOTAL_ALLOC_65 \ + MLD_MAX4_(MLD_TOTAL_ALLOC_65_KEYPAIR, MLD_TOTAL_ALLOC_65_PK_FROM_SK, \ + MLD_TOTAL_ALLOC_65_SIGN, MLD_TOTAL_ALLOC_65_VERIFY) +#define MLD_TOTAL_ALLOC_87 \ + MLD_MAX4_(MLD_TOTAL_ALLOC_87_KEYPAIR, MLD_TOTAL_ALLOC_87_PK_FROM_SK, \ + MLD_TOTAL_ALLOC_87_SIGN, MLD_TOTAL_ALLOC_87_VERIFY) #endif /* !MLD_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c index 936f1c39b26..4abe796c5d8 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c +++ b/crypto/fipsmodule/ml_dsa/mldsa/mldsa_native_bcm.c @@ -65,30 +65,13 @@ #include "poly.c" #include "poly_kl.c" #include "polyvec.c" +#include "polyvec_lazy.c" #include "sign.c" #if defined(MLD_CONFIG_USE_NATIVE_BACKEND_ARITH) -#if defined(MLD_SYS_AARCH64) -#include "native/aarch64/src/aarch64_zetas.c" -#include "native/aarch64/src/polyz_unpack_table.c" -#include "native/aarch64/src/rej_uniform_eta_table.c" -#include "native/aarch64/src/rej_uniform_table.c" -#endif /* MLD_SYS_AARCH64 */ #if defined(MLD_SYS_X86_64) #include "native/x86_64/src/consts.c" -#include "native/x86_64/src/poly_caddq_avx2.c" -#include "native/x86_64/src/poly_chknorm_avx2.c" -#include "native/x86_64/src/poly_decompose_32_avx2.c" -#include "native/x86_64/src/poly_decompose_88_avx2.c" -#include "native/x86_64/src/poly_use_hint_32_avx2.c" -#include "native/x86_64/src/poly_use_hint_88_avx2.c" -#include 
"native/x86_64/src/polyz_unpack_17_avx2.c" -#include "native/x86_64/src/polyz_unpack_19_avx2.c" -#include "native/x86_64/src/rej_uniform_avx2.c" -#include "native/x86_64/src/rej_uniform_eta2_avx2.c" -#include "native/x86_64/src/rej_uniform_eta4_avx2.c" -#include "native/x86_64/src/rej_uniform_table.c" #endif /* MLD_SYS_X86_64 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ @@ -158,8 +141,10 @@ #undef MLD_ERR_FAIL #undef MLD_ERR_OUT_OF_MEMORY #undef MLD_ERR_RNG_FAIL +#undef MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED #undef MLD_H #undef MLD_MAX3_ +#undef MLD_MAX4_ #undef MLD_PREHASH_NONE #undef MLD_PREHASH_SHA2_224 #undef MLD_PREHASH_SHA2_256 @@ -177,18 +162,21 @@ #undef MLD_TOTAL_ALLOC_44_KEYPAIR #undef MLD_TOTAL_ALLOC_44_KEYPAIR_NO_PCT #undef MLD_TOTAL_ALLOC_44_KEYPAIR_PCT +#undef MLD_TOTAL_ALLOC_44_PK_FROM_SK #undef MLD_TOTAL_ALLOC_44_SIGN #undef MLD_TOTAL_ALLOC_44_VERIFY #undef MLD_TOTAL_ALLOC_65 #undef MLD_TOTAL_ALLOC_65_KEYPAIR #undef MLD_TOTAL_ALLOC_65_KEYPAIR_NO_PCT #undef MLD_TOTAL_ALLOC_65_KEYPAIR_PCT +#undef MLD_TOTAL_ALLOC_65_PK_FROM_SK #undef MLD_TOTAL_ALLOC_65_SIGN #undef MLD_TOTAL_ALLOC_65_VERIFY #undef MLD_TOTAL_ALLOC_87 #undef MLD_TOTAL_ALLOC_87_KEYPAIR #undef MLD_TOTAL_ALLOC_87_KEYPAIR_NO_PCT #undef MLD_TOTAL_ALLOC_87_KEYPAIR_PCT +#undef MLD_TOTAL_ALLOC_87_PK_FROM_SK #undef MLD_TOTAL_ALLOC_87_SIGN #undef MLD_TOTAL_ALLOC_87_VERIFY #undef crypto_sign @@ -200,6 +188,7 @@ #undef MLD_ADD_PARAM_SET #undef MLD_ALLOC #undef MLD_APPLY +#undef MLD_ASM_FN_SIZE #undef MLD_ASM_FN_SYMBOL #undef MLD_ASM_NAMESPACE #undef MLD_BUILD_INTERNAL @@ -220,27 +209,30 @@ #undef MLD_ERR_FAIL #undef MLD_ERR_OUT_OF_MEMORY #undef MLD_ERR_RNG_FAIL +#undef MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED #undef MLD_EXTERNAL_API #undef MLD_FIPS202X4_HEADER_FILE #undef MLD_FIPS202_HEADER_FILE #undef MLD_FREE #undef MLD_INTERNAL_API +#undef MLD_INTERNAL_DATA_DECLARATION +#undef MLD_INTERNAL_DATA_DEFINITION #undef MLD_MULTILEVEL_BUILD #undef MLD_NAMESPACE #undef MLD_NAMESPACE_KL #undef MLD_NAMESPACE_PREFIX 
#undef MLD_NAMESPACE_PREFIX_KL -#undef MLK_UNION_OR_STRUCT #undef mld_memcpy #undef mld_memset /* mldsa/src/packing.h */ #undef MLD_PACKING_H -#undef mld_pack_pk -#undef mld_pack_sig_c_h +#undef mld_pack_sig_c +#undef mld_pack_sig_h #undef mld_pack_sig_z -#undef mld_pack_sk -#undef mld_unpack_pk -#undef mld_unpack_sig +#undef mld_pack_sk_rho_key_tr_s2 +#undef mld_pack_sk_s1 +#undef mld_sig_unpack_hints +#undef mld_unpack_pk_t1 #undef mld_unpack_sk /* mldsa/src/params.h */ #undef MLDSA_BETA @@ -275,7 +267,6 @@ #undef MLD_POLY_KL_H #undef mld_poly_challenge #undef mld_poly_decompose -#undef mld_poly_make_hint #undef mld_poly_uniform_eta #undef mld_poly_uniform_eta_4x #undef mld_poly_uniform_gamma1 @@ -288,29 +279,16 @@ #undef mld_polyz_unpack /* mldsa/src/polyvec.h */ #undef MLD_POLYVEC_H -#undef mld_polymat -#undef mld_polymat_get_row -#undef mld_polyvec_matrix_expand -#undef mld_polyvec_matrix_pointwise_montgomery #undef mld_polyveck -#undef mld_polyveck_add #undef mld_polyveck_caddq #undef mld_polyveck_chknorm #undef mld_polyveck_decompose #undef mld_polyveck_invntt_tomont -#undef mld_polyveck_make_hint #undef mld_polyveck_ntt #undef mld_polyveck_pack_eta -#undef mld_polyveck_pack_t0 #undef mld_polyveck_pack_w1 -#undef mld_polyveck_pointwise_poly_montgomery -#undef mld_polyveck_power2round #undef mld_polyveck_reduce -#undef mld_polyveck_shiftl -#undef mld_polyveck_sub #undef mld_polyveck_unpack_eta -#undef mld_polyveck_unpack_t0 -#undef mld_polyveck_use_hint #undef mld_polyvecl #undef mld_polyvecl_chknorm #undef mld_polyvecl_ntt @@ -319,6 +297,58 @@ #undef mld_polyvecl_uniform_gamma1 #undef mld_polyvecl_unpack_eta #undef mld_polyvecl_unpack_z +/* mldsa/src/polyvec_lazy.h */ +#undef MLD_POLYVEC_LAZY_H +#undef mld_poly_permute_bitrev_to_custom_optional +#undef mld_polymat +#undef mld_polymat_eager +#undef mld_polymat_lazy +#undef mld_polyvec_matrix_expand +#undef mld_polyvec_matrix_expand_eager +#undef mld_polyvec_matrix_expand_lazy +#undef 
mld_polyvec_matrix_pointwise_montgomery +#undef mld_polyvec_matrix_pointwise_montgomery_row +#undef mld_polyvec_matrix_pointwise_montgomery_row_eager +#undef mld_polyvec_matrix_pointwise_montgomery_row_lazy +#undef mld_polyvec_matrix_pointwise_montgomery_yvec +#undef mld_polyvec_matrix_pointwise_montgomery_yvec_eager +#undef mld_polyvec_matrix_pointwise_montgomery_yvec_lazy +#undef mld_sk_s1hat +#undef mld_sk_s1hat_eager +#undef mld_sk_s1hat_get_poly +#undef mld_sk_s1hat_get_poly_eager +#undef mld_sk_s1hat_get_poly_lazy +#undef mld_sk_s1hat_lazy +#undef mld_sk_s2hat +#undef mld_sk_s2hat_eager +#undef mld_sk_s2hat_get_poly +#undef mld_sk_s2hat_get_poly_eager +#undef mld_sk_s2hat_get_poly_lazy +#undef mld_sk_s2hat_lazy +#undef mld_sk_t0hat +#undef mld_sk_t0hat_eager +#undef mld_sk_t0hat_get_poly +#undef mld_sk_t0hat_get_poly_eager +#undef mld_sk_t0hat_get_poly_lazy +#undef mld_sk_t0hat_lazy +#undef mld_unpack_sk_s1hat +#undef mld_unpack_sk_s1hat_eager +#undef mld_unpack_sk_s1hat_lazy +#undef mld_unpack_sk_s2hat +#undef mld_unpack_sk_s2hat_eager +#undef mld_unpack_sk_s2hat_lazy +#undef mld_unpack_sk_t0hat +#undef mld_unpack_sk_t0hat_eager +#undef mld_unpack_sk_t0hat_lazy +#undef mld_yvec +#undef mld_yvec_eager +#undef mld_yvec_get_poly +#undef mld_yvec_get_poly_eager +#undef mld_yvec_get_poly_lazy +#undef mld_yvec_init +#undef mld_yvec_init_eager +#undef mld_yvec_init_lazy +#undef mld_yvec_lazy /* mldsa/src/rounding.h */ #undef MLD_2_POW_D #undef MLD_ROUNDING_H @@ -443,6 +473,7 @@ #undef MLD_SYS_AARCH64 #undef MLD_SYS_AARCH64_EB #undef MLD_SYS_APPLE +#undef MLD_SYS_ARMV81M_MVE #undef MLD_SYS_BIG_ENDIAN #undef MLD_SYS_H #undef MLD_SYS_LINUX @@ -450,6 +481,7 @@ #undef MLD_SYS_PPC64LE #undef MLD_SYS_RISCV32 #undef MLD_SYS_RISCV64 +#undef MLD_SYS_RISCV64_RVV #undef MLD_SYS_WINDOWS #undef MLD_SYS_X86_64 #undef MLD_SYS_X86_64_AVX2 @@ -466,63 +498,9 @@ #undef MLD_NATIVE_FUNC_FALLBACK #undef MLD_NATIVE_FUNC_SUCCESS #undef MLD_NTT_BOUND -#undef REDUCE32_RANGE_MAX +#undef 
MLD_REDUCE32_RANGE_MAX /* mldsa/src/native/meta.h */ #undef MLD_NATIVE_META_H -#if defined(MLD_SYS_AARCH64) -/* - * Undefine macros from native code (Arith, AArch64) - */ -/* mldsa/src/native/aarch64/meta.h */ -#undef MLD_ARITH_BACKEND_AARCH64 -#undef MLD_NATIVE_AARCH64_META_H -#undef MLD_USE_NATIVE_INTT -#undef MLD_USE_NATIVE_NTT -#undef MLD_USE_NATIVE_POINTWISE_MONTGOMERY -#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 -#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 -#undef MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 -#undef MLD_USE_NATIVE_POLYZ_UNPACK_17 -#undef MLD_USE_NATIVE_POLYZ_UNPACK_19 -#undef MLD_USE_NATIVE_POLY_CADDQ -#undef MLD_USE_NATIVE_POLY_CHKNORM -#undef MLD_USE_NATIVE_POLY_DECOMPOSE_32 -#undef MLD_USE_NATIVE_POLY_DECOMPOSE_88 -#undef MLD_USE_NATIVE_POLY_USE_HINT_32 -#undef MLD_USE_NATIVE_POLY_USE_HINT_88 -#undef MLD_USE_NATIVE_REJ_UNIFORM -#undef MLD_USE_NATIVE_REJ_UNIFORM_ETA2 -#undef MLD_USE_NATIVE_REJ_UNIFORM_ETA4 -/* mldsa/src/native/aarch64/src/arith_native_aarch64.h */ -#undef MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN -#undef MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN -#undef MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H -#undef mld_aarch64_intt_zetas_layer123456 -#undef mld_aarch64_intt_zetas_layer78 -#undef mld_aarch64_ntt_zetas_layer123456 -#undef mld_aarch64_ntt_zetas_layer78 -#undef mld_intt_asm -#undef mld_ntt_asm -#undef mld_poly_caddq_asm -#undef mld_poly_chknorm_asm -#undef mld_poly_decompose_32_asm -#undef mld_poly_decompose_88_asm -#undef mld_poly_pointwise_montgomery_asm -#undef mld_poly_use_hint_32_asm -#undef mld_poly_use_hint_88_asm -#undef mld_polyvecl_pointwise_acc_montgomery_l4_asm -#undef mld_polyvecl_pointwise_acc_montgomery_l5_asm -#undef mld_polyvecl_pointwise_acc_montgomery_l7_asm -#undef mld_polyz_unpack_17_asm -#undef mld_polyz_unpack_17_indices -#undef mld_polyz_unpack_19_asm -#undef mld_polyz_unpack_19_indices -#undef mld_rej_uniform_asm -#undef mld_rej_uniform_eta2_asm -#undef 
mld_rej_uniform_eta4_asm -#undef mld_rej_uniform_eta_table -#undef mld_rej_uniform_table -#endif /* MLD_SYS_AARCH64 */ #if defined(MLD_SYS_X86_64) /* * Undefine macros from native code (Arith, X86_64) @@ -553,14 +531,14 @@ #undef MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN #undef MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN #undef MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H -#undef mld_invntt_avx2 -#undef mld_ntt_avx2 -#undef mld_nttunpack_avx2 -#undef mld_pointwise_acc_l4_avx2 -#undef mld_pointwise_acc_l5_avx2 -#undef mld_pointwise_acc_l7_avx2 -#undef mld_pointwise_avx2 -#undef mld_poly_caddq_avx2 +#undef mld_invntt_avx2_asm +#undef mld_ntt_avx2_asm +#undef mld_nttunpack_avx2_asm +#undef mld_pointwise_acc_l4_avx2_asm +#undef mld_pointwise_acc_l5_avx2_asm +#undef mld_pointwise_acc_l7_avx2_asm +#undef mld_pointwise_avx2_asm +#undef mld_poly_caddq_avx2_asm #undef mld_poly_chknorm_avx2 #undef mld_poly_decompose_32_avx2 #undef mld_poly_decompose_88_avx2 diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/api.h b/crypto/fipsmodule/ml_dsa/mldsa/native/api.h new file mode 100644 index 00000000000..2b603b387dd --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/api.h @@ -0,0 +1,609 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLD_NATIVE_API_H +#define MLD_NATIVE_API_H +/* + * Native arithmetic interface + * + * This header is primarily for documentation purposes. + * It should not be included by backend implementations. + * + * To ensure consistency with backends, the header will be + * included automatically after inclusion of the active + * backend, to ensure consistency of function signatures, + * and run sanity checks. + */ + +#include "../cbmc.h" +#include "../common.h" + +/* Backends must return MLD_NATIVE_FUNC_SUCCESS upon success. 
*/ +#define MLD_NATIVE_FUNC_SUCCESS (0) +/* Backends may return MLD_NATIVE_FUNC_FALLBACK to signal to the frontend that + * the target/parameters are unsupported; typically, this would be because of + * dependencies on CPU features not detected on the host CPU. In this case, + * the frontend falls back to the default C implementation. + * + * IMPORTANT: Backend implementations must ensure that the decision of whether + * to fallback (return MLD_NATIVE_FUNC_FALLBACK) or not must never depend on + * the input data itself. Fallback decisions may only depend on system + * capabilities (e.g., CPU features) and, where present, length information. + * This requirement applies to all backend functions to maintain constant-time + * properties. + */ +#define MLD_NATIVE_FUNC_FALLBACK (-1) + +/* Bound on absolute value of coefficients after NTT. + * + * NOTE: This is the same bound as in poly.h and has to be kept + * in sync. */ +#define MLD_NTT_BOUND (9 * MLDSA_Q) + +/* Absolute exclusive upper bound for the output of the inverse NTT + * + * NOTE: This is the same bound as in poly.h and has to be kept + * in sync. */ +#define MLD_INTT_BOUND MLDSA_Q + +/* Absolute bound for range of mld_reduce32() + * + * NOTE: This is the same bound as in reduce.h and has to be kept + * in sync. */ +/* check-magic: 6283009 == (MLD_REDUCE32_DOMAIN_MAX - 255 * MLDSA_Q + 1) */ +#define MLD_REDUCE32_RANGE_MAX 6283009 +/* + * This is the C<->native interface allowing for the drop-in of + * native code for performance-critical arithmetic components of ML-DSA. + * + * A _backend_ is a specific implementation of (part of) this interface. + * + * To add a function to a backend, define MLD_USE_NATIVE_XXX and + * implement `static inline xxx(...)` in the profile header. + */ + +/* + * Those functions are meant to be trivial wrappers around the chosen native + * implementation. They are static inline to avoid unnecessary calls.
+ * The macro before each declaration controls whether a native + * implementation is present. + */ + +#if defined(MLD_USE_NATIVE_NTT) +/** + * Computes negacyclic number-theoretic transform (NTT) of a polynomial + * in place. + * + * The input polynomial is assumed to be in normal order. The output + * polynomial is in bitreversed order. + * + * @param[in,out] p Pointer to in/output polynomial. + */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_ntt_native(int32_t p[MLDSA_N]) +__contract__( + requires(memory_no_alias(p, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(p, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(p, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLDSA_N, MLD_NTT_BOUND)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLDSA_N)) +); +#endif /* MLD_USE_NATIVE_NTT */ + + +#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER) +/* + * This must only be set if NTT and INTT have native implementations + * that are adapted to the custom order. + */ +#if !defined(MLD_USE_NATIVE_NTT) || !defined(MLD_USE_NATIVE_INTT) +#error \ + "Invalid native profile: MLD_USE_NATIVE_NTT_CUSTOM_ORDER can only be \ +set if there are native implementations for NTT and INTT." +#endif + +/** + * When MLD_USE_NATIVE_NTT_CUSTOM_ORDER is defined, convert a polynomial in + * NTT domain from bitreversed order to the custom order output by the native + * NTT. + * + * This must only be defined if there is native code for both the NTT and + * INTT. + * + * @param[in,out] p Pointer to in/output polynomial. 
+ */ +static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t p[MLDSA_N]) +__contract__( + /* We don't specify that this should be a permutation, but only + * that it does not change the bound established at the end of + * mld_polyvec_matrix_expand. + */ + requires(memory_no_alias(p, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(p, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(p, sizeof(int32_t) * MLDSA_N)) + ensures(array_bound(p, 0, MLDSA_N, 0, MLDSA_Q))); +#endif /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */ + + +#if defined(MLD_USE_NATIVE_INTT) +/** + * Computes inverse of negacyclic number-theoretic transform (NTT) of a + * polynomial in place. + * + * The input polynomial is in bitreversed order. The output polynomial is + * assumed to be in normal order. + * + * @param[in,out] p Pointer to in/output polynomial. + */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_intt_native(int32_t p[MLDSA_N]) +__contract__( + requires(memory_no_alias(p, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(p, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(p, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLDSA_N, MLD_INTT_BOUND)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLDSA_N)) +); +#endif /* MLD_USE_NATIVE_INTT */ + +#if defined(MLD_USE_NATIVE_REJ_UNIFORM) +/** + * Run rejection sampling on uniform random bytes to generate uniform random + * integers in [0, MLDSA_Q-1]. + * + * @param[out] r Pointer to output buffer. + * @param len Requested number of 32-bit integers (uniform mod + * MLDSA_Q). + * @param[in] buf Pointer to input buffer (assumed to be uniform random + * bytes). + * @param buflen Length of input buffer in bytes. 
+ * + * @return - MLD_NATIVE_FUNC_FALLBACK if the native implementation does not + * support the input lengths. + * - Otherwise, the non-negative number of sampled 32-bit integers + * (at most len). + */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +__contract__( + requires(len <= MLDSA_N) + requires(buflen <= ( 5 * 168) && buflen % 3 == 0) + requires(memory_no_alias(r, sizeof(int32_t) * len)) + requires(memory_no_alias(buf, buflen)) + assigns(memory_slice(r, sizeof(int32_t) * len)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || (0 <= return_value && return_value <= len)) + ensures((return_value != MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(r, 0, (unsigned) return_value, 0, MLDSA_Q)) +); +#endif /* MLD_USE_NATIVE_REJ_UNIFORM */ + +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +#if defined(MLD_USE_NATIVE_REJ_UNIFORM_ETA2) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_ETA == 2 +/** + * Run rejection sampling on uniform random bytes to generate uniform random + * integers in [-2, +2]. + * + * @param[out] r Pointer to output buffer. + * @param len Requested number of 32-bit integers (uniform in + * [-2, +2]). + * @param[in] buf Pointer to input buffer (assumed to be uniform random + * bytes). + * @param buflen Length of input buffer in bytes. + * + * @return - MLD_NATIVE_FUNC_FALLBACK if the native implementation does not + * support the input lengths. + * - Otherwise, the non-negative number of sampled 32-bit integers + * (at most len). 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +__contract__( + requires(len <= MLDSA_N) + requires(buflen <= (2 * 136)) + requires(memory_no_alias(r, sizeof(int32_t) * len)) + requires(memory_no_alias(buf, buflen)) + assigns(memory_slice(r, sizeof(int32_t) * len)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || (0 <= return_value && return_value <= len)) + ensures((return_value != MLD_NATIVE_FUNC_FALLBACK) ==> (array_abs_bound(r, 0, return_value, MLDSA_ETA + 1))) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_ETA == 2 */ +#endif /* MLD_USE_NATIVE_REJ_UNIFORM_ETA2 */ + +#if defined(MLD_USE_NATIVE_REJ_UNIFORM_ETA4) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_ETA == 4 +/** + * Run rejection sampling on uniform random bytes to generate uniform random + * integers in [-4, +4]. + * + * @param[out] r Pointer to output buffer. + * @param len Requested number of 32-bit integers (uniform in + * [-4, +4]). + * @param[in] buf Pointer to input buffer (assumed to be uniform random + * bytes). + * @param buflen Length of input buffer in bytes. + * + * @return - MLD_NATIVE_FUNC_FALLBACK if the native implementation does not + * support the input lengths. + * - Otherwise, the non-negative number of sampled 32-bit integers + * (at most len). 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, + const uint8_t *buf, + unsigned buflen) +__contract__( + requires(len <= MLDSA_N) + requires(buflen <= (2 * 136)) + requires(memory_no_alias(r, sizeof(int32_t) * len)) + requires(memory_no_alias(buf, buflen)) + assigns(memory_slice(r, sizeof(int32_t) * len)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || (0 <= return_value && return_value <= len)) + ensures((return_value != MLD_NATIVE_FUNC_FALLBACK) ==> (array_abs_bound(r, 0, return_value, MLDSA_ETA + 1))) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_ETA == 4 */ +#endif /* MLD_USE_NATIVE_REJ_UNIFORM_ETA4 */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) +#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +/** + * Native implementation of poly_decompose for GAMMA2 = (MLDSA_Q-1)/32. + * + * For all coefficients c of the input polynomial, compute high and low bits + * c0, c1 such that c mod MLDSA_Q = c1*(2*GAMMA2) + c0 with + * -(2*GAMMA2)/2 < c0 <= (2*GAMMA2)/2 except c1 = (MLDSA_Q-1)/(2*GAMMA2) where + * we set c1 = 0 and -(2*GAMMA2)/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0. + * Assumes coefficients to be standard representatives. + * + * @param[out] a1 Output polynomial with coefficients c1. + * @param[in,out] a0 Input/output polynomial. Output has coefficients c0. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_decompose_32_native(int32_t *a1, int32_t *a0) +__contract__( + requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a1, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(a0, 0, MLDSA_N, MLDSA_GAMMA2+1)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a0, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ +#endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_32 */ + +#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/** + * Native implementation of poly_decompose for GAMMA2 = (MLDSA_Q-1)/88. + * + * For all coefficients c of the input polynomial, compute high and low bits + * c0, c1 such that c mod MLDSA_Q = c1*(2*GAMMA2) + c0 with + * -(2*GAMMA2)/2 < c0 <= (2*GAMMA2)/2 except c1 = (MLDSA_Q-1)/(2*GAMMA2) where + * we set c1 = 0 and -(2*GAMMA2)/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0. + * Assumes coefficients to be standard representatives. + * + * @param[out] a1 Output polynomial with coefficients c1. + * @param[in,out] a0 Input/output polynomial. Output has coefficients c0. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_decompose_88_native(int32_t *a1, int32_t *a0) +__contract__( + requires(memory_no_alias(a1, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a0, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(a1, sizeof(int32_t) * MLDSA_N)) + assigns(memory_slice(a0, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a1, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(a0, 0, MLDSA_N, MLDSA_GAMMA2+1)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_bound(a0, 0, MLDSA_N, 0, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a0, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ +#endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_88 */ +#endif /* !MLD_CONFIG_NO_SIGN_API */ + +#if defined(MLD_USE_NATIVE_POLY_CADDQ) +/** + * For all coefficients of in/out polynomial add Q if coefficient is negative. + * + * @param[in,out] a Pointer to input/output polynomial. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N]) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a, MLDSA_N)) +); +#endif /* MLD_USE_NATIVE_POLY_CADDQ */ + +#if !defined(MLD_CONFIG_NO_VERIFY_API) +#if defined(MLD_USE_NATIVE_POLY_USE_HINT_32) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +/** + * Native implementation of poly_use_hint for GAMMA2 = (MLDSA_Q-1)/32. + * + * Use hint h to correct the high bits of a in-place. + * + * @param[in,out] a Input/output polynomial. + * @param[in] h Hint polynomial. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_use_hint_32_native(int32_t *a, const int32_t *h) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(h, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) + requires(array_bound(h, 0, MLDSA_N, 0, 2)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ +#endif /* MLD_USE_NATIVE_POLY_USE_HINT_32 */ + +#if defined(MLD_USE_NATIVE_POLY_USE_HINT_88) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/** + * Native implementation of poly_use_hint for GAMMA2 = (MLDSA_Q-1)/88. + * + * Use hint h to correct the high bits of a in-place. + * + * @param[in,out] a Input/output polynomial. + * @param[in] h Hint polynomial. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_use_hint_88_native(int32_t *a, const int32_t *h) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(h, sizeof(int32_t) * MLDSA_N)) + requires(array_bound(a, 0, MLDSA_N, 0, MLDSA_Q)) + requires(array_bound(h, 0, MLDSA_N, 0, 2)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(a, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ +#endif /* MLD_USE_NATIVE_POLY_USE_HINT_88 */ +#endif /* !MLD_CONFIG_NO_VERIFY_API */ + +#if defined(MLD_USE_NATIVE_POLY_CHKNORM) +/** + * Check infinity norm of polynomial against given bound. Assumes input + * coefficients were reduced by mld_reduce32(). + * + * @param[in] a Pointer to polynomial. + * @param B Norm bound, which must be in the range + * 0 .. MLDSA_Q - MLD_REDUCE32_RANGE_MAX inclusive. + * + * @return - MLD_NATIVE_FUNC_FALLBACK if the target CPU cannot support a + * native implementation of this function. + * - MLD_NATIVE_FUNC_SUCCESS if the infinity norm is strictly smaller + * than B. + * - 1 otherwise. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(0 <= B && B <= MLDSA_Q - MLD_REDUCE32_RANGE_MAX) + requires(array_bound(a, 0, MLDSA_N, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == 0 || + return_value == 1) + ensures((return_value != MLD_NATIVE_FUNC_FALLBACK) ==> + ((return_value == 0) == array_abs_bound(a, 0, MLDSA_N, B))) +); +#endif /* MLD_USE_NATIVE_POLY_CHKNORM */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) +#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_17) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLD_CONFIG_PARAMETER_SET == 44 +/** + * Native implementation of polyz_unpack for GAMMA1 = 2^17. + * + * Unpack polynomial z with coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. + * + * @param[out] r Pointer to output polynomial. + * @param[in] a Byte array with bit-packed polynomial. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, MLDSA_POLYZ_PACKEDBYTES)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(r, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ + */ +#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_17 */ + +#if defined(MLD_USE_NATIVE_POLYZ_UNPACK_19) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87) +/** + * Native implementation of polyz_unpack for GAMMA1 = 2^19. + * + * Unpack polynomial z with coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. + * + * @param[out] r Pointer to output polynomial. + * @param[in] a Byte array with bit-packed polynomial. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, MLDSA_POLYZ_PACKEDBYTES)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_bound(r, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(r, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87 */ +#endif /* MLD_USE_NATIVE_POLYZ_UNPACK_19 */ +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +#if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) +/** + * Pointwise multiplication of polynomials in NTT domain with Montgomery + * reduction. Destructive in the first argument. + * + * Computes a[i] = a[i] * b[i] * R^(-1) mod MLDSA_Q for all i, where R = 2^32. + * + * @param[in,out] a First input/output polynomial. + * @param[in] b Second input polynomial. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_pointwise_montgomery_native( + int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(a, 0, MLDSA_N, MLD_NTT_BOUND)) + requires(array_abs_bound(b, 0, MLDSA_N, MLD_NTT_BOUND)) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(a, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(a, 0, MLDSA_N, MLD_NTT_BOUND)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(b, 0, MLDSA_N, MLD_NTT_BOUND)) +); +#endif /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */ +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 4 +/** + * Native implementation of polyvecl_pointwise_acc_montgomery for MLDSA_L = 4. + * + * Pointwise multiply vectors of polynomials of length MLDSA_L, multiply + * resulting vector by 2^{-32} and add (accumulate) polynomials in it. + * Input/output vectors are in NTT domain representation. + * + * @param[out] w Output polynomial. + * @param[in] u First input vector. + * @param[in] v Second input vector. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l4_native( + int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N], + const int32_t v[4][MLDSA_N]) +__contract__( + requires(memory_no_alias(w, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(u, sizeof(int32_t) * 4 * MLDSA_N)) + requires(memory_no_alias(v, sizeof(int32_t) * 4 * MLDSA_N)) + requires(forall(l0, 0, 4, + array_bound(u[l0], 0, MLDSA_N, 0, MLDSA_Q))) + requires(forall(l1, 0, 4, + array_abs_bound(v[l1], 0, MLDSA_N, MLD_NTT_BOUND))) + assigns(memory_slice(w, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(w, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(w, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 4 */ +#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 */ + +#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5 +/** + * Native implementation of polyvecl_pointwise_acc_montgomery for MLDSA_L = 5. + * + * Pointwise multiply vectors of polynomials of length MLDSA_L, multiply + * resulting vector by 2^{-32} and add (accumulate) polynomials in it. + * Input/output vectors are in NTT domain representation. + * + * @param[out] w Output polynomial. + * @param[in] u First input vector. + * @param[in] v Second input vector. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l5_native( + int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N], + const int32_t v[5][MLDSA_N]) +__contract__( + requires(memory_no_alias(w, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(u, sizeof(int32_t) * 5 * MLDSA_N)) + requires(memory_no_alias(v, sizeof(int32_t) * 5 * MLDSA_N)) + requires(forall(l0, 0, 5, + array_bound(u[l0], 0, MLDSA_N, 0, MLDSA_Q))) + requires(forall(l1, 0, 5, + array_abs_bound(v[l1], 0, MLDSA_N, MLD_NTT_BOUND))) + assigns(memory_slice(w, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(w, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(w, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5 */ +#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 */ + +#if defined(MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7) +#if defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 7 +/** + * Native implementation of polyvecl_pointwise_acc_montgomery for MLDSA_L = 7. + * + * Pointwise multiply vectors of polynomials of length MLDSA_L, multiply + * resulting vector by 2^{-32} and add (accumulate) polynomials in it. + * Input/output vectors are in NTT domain representation. + * + * @param[out] w Output polynomial. + * @param[in] u First input vector. + * @param[in] v Second input vector. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_polyvecl_pointwise_acc_montgomery_l7_native( + int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N], + const int32_t v[7][MLDSA_N]) +__contract__( + requires(memory_no_alias(w, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(u, sizeof(int32_t) * 7 * MLDSA_N)) + requires(memory_no_alias(v, sizeof(int32_t) * 7 * MLDSA_N)) + requires(forall(l0, 0, 7, + array_bound(u[l0], 0, MLDSA_N, 0, MLDSA_Q))) + requires(forall(l1, 0, 7, + array_abs_bound(v[l1], 0, MLDSA_N, MLD_NTT_BOUND))) + assigns(memory_slice(w, sizeof(int32_t) * MLDSA_N)) + ensures(return_value == MLD_NATIVE_FUNC_FALLBACK || return_value == MLD_NATIVE_FUNC_SUCCESS) + ensures((return_value == MLD_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(w, 0, MLDSA_N, MLDSA_Q)) + ensures((return_value == MLD_NATIVE_FUNC_FALLBACK) ==> array_unchanged(w, MLDSA_N)) +); +#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 7 */ +#endif /* MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 */ + +#endif /* !MLD_NATIVE_API_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/arith_native_x86_64.h b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/arith_native_x86_64.h new file mode 100644 index 00000000000..7c2975e5e63 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/arith_native_x86_64.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H +#define MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H +#include "../../../common.h" + +#include "consts.h" + +#define MLD_AVX2_REJ_UNIFORM_BUFLEN \ + (5 * 168) /* REJ_UNIFORM_NBLOCKS * SHAKE128_RATE */ + + +/* + * Sampling 256 coefficients mod 15 using rejection sampling from 4 bits. + * Expected number of required bytes: (256 * (16/15))/2 = 136.5 bytes. 
+ * We sample 1 block (=136 bytes) of SHAKE256_RATE output initially. + * Sampling 2 blocks initially results in slightly worse performance. + */ +#define MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN (1 * 136) + +/* + * Sampling 256 coefficients mod 9 using rejection sampling from 4 bits. + * Expected number of required bytes: (256 * (16/9))/2 = 227.5 bytes. + * We sample 2 blocks (=272 bytes) of SHAKE256_RATE output initially. + */ +#define MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN (2 * 136) + +#define mld_rej_uniform_table MLD_NAMESPACE(mld_rej_uniform_table) +MLD_INTERNAL_DATA_DECLARATION const uint8_t mld_rej_uniform_table[256][8]; + +#define mld_ntt_avx2_asm MLD_NAMESPACE(ntt_avx2_asm) +void mld_ntt_avx2_asm(int32_t *r, const int32_t *qdata) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/ntt_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, 8380417)) + requires(qdata == mld_qdata) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + /* check-magic: off */ + ensures(array_abs_bound(r, 0, MLDSA_N, 42035262)) + /* check-magic: on */ +); + +#define mld_invntt_avx2_asm MLD_NAMESPACE(invntt_avx2_asm) +void mld_invntt_avx2_asm(int32_t *r, const int32_t *qdata) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/intt_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, 8380417)) + requires(qdata == mld_qdata) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + /* check-magic: off */ + ensures(array_abs_bound(r, 0, MLDSA_N, 6285313)) + /* check-magic: on */ +); + +#define mld_nttunpack_avx2_asm MLD_NAMESPACE(nttunpack_avx2_asm) +void mld_nttunpack_avx2_asm(int32_t *r) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/nttunpack_avx2_asm.ml */ +__contract__( + 
requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(array_abs_bound(r, 0, MLDSA_N, 8380417)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + /* Output is a permutation of input: every output coefficient + * is some input coefficient */ + ensures(forall(i, 0, MLDSA_N, exists(j, 0, MLDSA_N, + r[i] == old(*(int32_t (*)[MLDSA_N])r)[j]))) +); + +#define mld_rej_uniform_avx2 MLD_NAMESPACE(mld_rej_uniform_avx2) +MLD_MUST_CHECK_RETURN_VALUE +unsigned mld_rej_uniform_avx2(int32_t *r, + const uint8_t buf[MLD_AVX2_REJ_UNIFORM_BUFLEN]); + +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +#define mld_rej_uniform_eta2_avx2 MLD_NAMESPACE(mld_rej_uniform_eta2_avx2) +MLD_MUST_CHECK_RETURN_VALUE +unsigned mld_rej_uniform_eta2_avx2( + int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA2_BUFLEN]); + +#define mld_rej_uniform_eta4_avx2 MLD_NAMESPACE(mld_rej_uniform_eta4_avx2) +MLD_MUST_CHECK_RETURN_VALUE +unsigned mld_rej_uniform_eta4_avx2( + int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) +#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2) +void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0); + +#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) +void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0); +#endif /* !MLD_CONFIG_NO_SIGN_API */ + +#define mld_poly_caddq_avx2_asm MLD_NAMESPACE(poly_caddq_avx2_asm) +void mld_poly_caddq_avx2_asm(int32_t *r); + +#if !defined(MLD_CONFIG_NO_VERIFY_API) +#define mld_poly_use_hint_32_avx2 MLD_NAMESPACE(mld_poly_use_hint_32_avx2) +void mld_poly_use_hint_32_avx2(int32_t *a, const int32_t *h); + +#define mld_poly_use_hint_88_avx2 MLD_NAMESPACE(mld_poly_use_hint_88_avx2) +void mld_poly_use_hint_88_avx2(int32_t *a, const int32_t *h); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ + +#define mld_poly_chknorm_avx2 MLD_NAMESPACE(mld_poly_chknorm_avx2) +MLD_MUST_CHECK_RETURN_VALUE +int 
mld_poly_chknorm_avx2(const int32_t *a, int32_t B); + +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) +#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2) +void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a); + +#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2) +void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ + +#define mld_pointwise_avx2_asm MLD_NAMESPACE(pointwise_avx2_asm) +void mld_pointwise_avx2_asm(int32_t *a, const int32_t *b, const int32_t *qdata) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/pointwise_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(a, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * MLDSA_N)) + /* check-magic: off */ + requires(array_abs_bound(a, 0, MLDSA_N, 75423753)) + requires(array_abs_bound(b, 0, MLDSA_N, 75423753)) + requires(qdata == mld_qdata) + assigns(memory_slice(a, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(a, 0, MLDSA_N, 8380417)) + /* check-magic: on */ +); + +#define mld_pointwise_acc_l4_avx2_asm MLD_NAMESPACE(pointwise_acc_l4_avx2_asm) +void mld_pointwise_acc_l4_avx2_asm(int32_t c[MLDSA_N], + const int32_t a[4][MLDSA_N], + const int32_t b[4][MLDSA_N], + const int32_t *qdata) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/pointwise_acc_l4_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(c, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, sizeof(int32_t) * 4 * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * 4 * MLDSA_N)) + /* check-magic: off */ + requires(forall(l0, 0, 4, array_abs_bound(a[l0], 0, MLDSA_N, 8380417))) + requires(forall(l1, 0, 4, array_abs_bound(b[l1], 0, MLDSA_N, 75423753))) + requires(qdata == mld_qdata) + assigns(memory_slice(c, sizeof(int32_t) * MLDSA_N)) + 
ensures(array_abs_bound(c, 0, MLDSA_N, 8380417)) + /* check-magic: on */ +); + +#define mld_pointwise_acc_l5_avx2_asm MLD_NAMESPACE(pointwise_acc_l5_avx2_asm) +void mld_pointwise_acc_l5_avx2_asm(int32_t c[MLDSA_N], + const int32_t a[5][MLDSA_N], + const int32_t b[5][MLDSA_N], + const int32_t *qdata) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/pointwise_acc_l5_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(c, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, sizeof(int32_t) * 5 * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * 5 * MLDSA_N)) + /* check-magic: off */ + requires(forall(l0, 0, 5, array_abs_bound(a[l0], 0, MLDSA_N, 8380417))) + requires(forall(l1, 0, 5, array_abs_bound(b[l1], 0, MLDSA_N, 75423753))) + requires(qdata == mld_qdata) + assigns(memory_slice(c, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(c, 0, MLDSA_N, 8380417)) + /* check-magic: on */ +); + +#define mld_pointwise_acc_l7_avx2_asm MLD_NAMESPACE(pointwise_acc_l7_avx2_asm) +void mld_pointwise_acc_l7_avx2_asm(int32_t c[MLDSA_N], + const int32_t a[7][MLDSA_N], + const int32_t b[7][MLDSA_N], + const int32_t *qdata) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/pointwise_acc_l7_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(c, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, sizeof(int32_t) * 7 * MLDSA_N)) + requires(memory_no_alias(b, sizeof(int32_t) * 7 * MLDSA_N)) + /* check-magic: off */ + requires(forall(l0, 0, 7, array_abs_bound(a[l0], 0, MLDSA_N, 8380417))) + requires(forall(l1, 0, 7, array_abs_bound(b[l1], 0, MLDSA_N, 75423753))) + requires(qdata == mld_qdata) + assigns(memory_slice(c, sizeof(int32_t) * MLDSA_N)) + ensures(array_abs_bound(c, 0, MLDSA_N, 8380417)) + /* check-magic: on */ +); + +#endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.c 
b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.c new file mode 100644 index 00000000000..03772d7f637 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +/* + * Table of zeta values used in the AVX2 forward and inverse NTT + * See autogen for details. + */ +MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const int32_t mld_qdata[624] = { + 8380417, 8380417, 8380417, 8380417, 8380417, + 8380417, 8380417, 8380417, 58728449, 58728449, + 58728449, 58728449, 58728449, 58728449, 58728449, + 58728449, -8395782, -8395782, -8395782, -8395782, + -8395782, -8395782, -8395782, -8395782, 41978, + 41978, 41978, 41978, 41978, 41978, + 41978, 41978, -151046689, 1830765815, -1929875198, + -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, + -1815525077, -1815525077, -1815525077, -1374673747, -1374673747, + -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, + -1091570561, -1929495947, -1929495947, -1929495947, -1929495947, + 515185417, 515185417, 515185417, 515185417, -285697463, + -285697463, -285697463, -285697463, 625853735, 625853735, + 625853735, 625853735, 1727305304, 1727305304, 2082316400, + 2082316400, -1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, + -346752664, 684667771, 684667771, 1654287830, 1654287830, + -878576921, -878576921, -1257667337, -1257667337, -748618600, + -748618600, 329347125, 329347125, 1837364258, 1837364258, + -1443016191, -1443016191, -1170414139, -1170414139, 
-1846138265, + -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, + -1898723372, -594436433, -202001019, -475984260, -561427818, + 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, + 992097815, 879957084, 2024403852, 1484874664, -1636082790, + -285388938, -1983539117, -1495136972, -950076368, -1714807468, + -952438995, -1574918427, 1350681039, -2143979939, 1599739335, + -1285853323, -993005454, -1440787840, 568627424, -783134478, + -588790216, 289871779, -1262003603, 2135294594, -1018755525, + -889861155, 1665705315, 1321868265, 1225434135, -1784632064, + 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, + -518252220, 1957047970, 1146323031, -654783359, -1974159335, + 1651689966, 140455867, -1039411342, 1955560694, 1529189038, + -2131021878, -247357819, 1518161567, -86965173, 1708872713, + 1787797779, 1638590967, -120646188, -1669960606, -916321552, + 1155548552, 2143745726, 1210558298, -1261461890, -318346816, + 628664287, -1729304568, 1422575624, 1424130038, -1185330464, + 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, + 1014493059, 863641633, 1819892093, 2124962073, -1223601433, + -1920467227, -1637785316, -1536588520, 694382729, 235104446, + -1045062172, 831969619, -300448763, 756955444, -260312805, + 1554794072, 1339088280, -2040058690, -853476187, -2047270596, + -1723816713, -1591599803, -440824168, 1119856484, 1544891539, + 155290192, -973777462, 991903578, 912367099, -44694137, + 1176904444, -421552614, -818371958, 1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, + -314284737, 985022747, 963438279, -1078959975, 604552167, + -1021949428, 608791570, 173440395, -2126092136, -1316619236, + -1039370342, 6087993, -110126092, 565464272, -1758099917, + -1600929361, 879867909, -1809756372, 400711272, 
1363007700, + 30313375, -326425360, 1683520342, -517299994, 2027935492, + -1372618620, 128353682, -1123881663, 137583815, -635454918, + -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, + -384158533, 827959816, -596344473, 702390549, -279505433, + -260424530, -71875110, -1208667171, -1499603926, 2036925262, + -540420426, 746144248, -1420958686, 2032221021, 1904936414, + 1257750362, 1926727420, 1931587462, 1258381762, 885133339, + 1629985060, 1967222129, 6363718, -1287922800, 1136965286, + 1779436847, 1116720494, 1042326957, 1405999311, 713994583, + 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, + 201262505, 1934038751, 374860238, -3975713, 25847, + -2608894, -518909, 237124, -777960, -876248, + 466468, 1826347, 1826347, 1826347, 1826347, + 2353451, 2353451, 2353451, 2353451, -359251, + -359251, -359251, -359251, -2091905, -2091905, + -2091905, -2091905, 3119733, 3119733, 3119733, + 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, + 2680103, 2680103, 2680103, 2725464, 2725464, + 1024112, 1024112, -1079900, -1079900, 3585928, + 3585928, -549488, -549488, -1119584, -1119584, + 2619752, 2619752, -2108549, -2108549, -2118186, + -2118186, -3859737, -3859737, -1399561, -1399561, + -3277672, -3277672, 1757237, 1757237, -19422, + -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, + -3592148, -2537516, 3915439, -3861115, -3043716, + 3574422, -2867647, 3539968, -300467, 2348700, + -539299, -1699267, -1643818, 3505694, -3821735, + 3507263, -2140649, -1600420, 3699596, 811944, + 531354, 954230, 3881043, 3900724, -2556880, + 2071892, -2797779, -3930395, -3677745, -1452451, + 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, + -22981, -381987, 1852771, -3343383, 508951, + 44288, 904516, -3724342, 1653064, 
2389356, + 759969, 189548, 3159746, -2409325, 1315589, + 1285669, -812732, -3019102, -3628969, -1528703, + -3041255, 3475950, -1585221, 1939314, -1000202, + -3157330, 126922, -983419, 2715295, -3693493, + -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, + -3249728, -210977, -1316856, -3553272, -1851402, + -177440, 1341330, -1584928, -1439742, -3881060, + 3839961, 2091667, -3342478, 266997, -3520352, + 900702, 495491, -655327, -3556995, 342297, + 3437287, 2842341, 4055324, -3767016, -2994039, + -1333058, -451100, -1279661, 1500165, -542412, + -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, + 183443, -554416, 3937738, 3407706, 2244091, + 2434439, -3759364, 1859098, -1613174, -3122442, + -525098, 286988, -3342277, 2691481, 1247620, + 1250494, 1869119, 1237275, 1312455, 1917081, + 777191, -2831860, -3724270, 2432395, 3369112, + 162844, 1652634, 3523897, -975884, 1723600, + -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, + -43260, 2031748, -768622, -2437823, 1735879, + -2590150, 2486353, 2635921, 1903435, -3318210, + 3306115, -2546312, 2235880, -1671176, 594136, + 2454455, 185531, 1616392, -3694233, 3866901, + 1717735, -1803090, -260646, -420899, 1612842, + -48306, -846154, 3817976, -3562462, 3513181, + -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, + -1050970, -1430225, -1962642, -1374803, 3406031, + -1846953, -3776993, -164721, -1207385, 3014001, + -1799107, 269760, 472078, 1910376, -3833893, + -2286327, -3545687, -1362209, 1976782, +}; + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_consts) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.h b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.h new file mode 
100644 index 00000000000..5d48e115131 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/consts.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mldsa-native repository. + * Do not modify it directly. + */ + +#ifndef MLD_NATIVE_X86_64_SRC_CONSTS_H +#define MLD_NATIVE_X86_64_SRC_CONSTS_H +#include "../../../common.h" +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQ 0 +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV 8 +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV_QINV 16 +#define MLD_AVX2_BACKEND_DATA_OFFSET_8XDIV 24 +#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS_QINV 32 +#define MLD_AVX2_BACKEND_DATA_OFFSET_ZETAS 328 + +#ifndef __ASSEMBLER__ +#define mld_qdata MLD_NAMESPACE(qdata) +MLD_INTERNAL_DATA_DECLARATION const int32_t mld_qdata[624]; +#endif + +#endif /* !MLD_NATIVE_X86_64_SRC_CONSTS_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/intt_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/intt_avx2_asm.S new file mode 100644 index 00000000000..88a277a1e3e --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/intt_avx2_asm.S @@ -0,0 +1,2308 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + + /* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/intt_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_invntt_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_invntt_avx2_asm) +S2N_BN_SYMBOL(mldsa_invntt_avx2_asm): + + .cfi_startproc + vmovdqa (%rsi), %ymm0 + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpermq $0x1b, 0x500(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x9a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x480(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x920(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq 
$0x1b, 0x400(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x380(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x820(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x300(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, 
%ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x280(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x720(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd 
%ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x200(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6a0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + 
vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x180(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq 
$0x1b, 0x620(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + 
vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0x100(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = 
ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, 
%ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + 
vmovdqa %ymm9, (%rdi) + vmovdqa %ymm10, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm3, 0x80(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm8, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpermq $0x1b, 0x4e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x980(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x460(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x900(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = 
ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x880(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x360(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x800(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x780(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = 
ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x260(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x700(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, 
%ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1e0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x680(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd 
%ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = 
ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x160(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x600(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, 
%ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xe0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x580(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd 
%ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + 
vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = 
ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm10, 0x120(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm4, 0x160(%rdi) + vmovdqa %ymm3, 0x180(%rdi) + vmovdqa %ymm5, 0x1a0(%rdi) + vmovdqa %ymm8, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vpermq $0x1b, 0x4c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x960(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x440(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup 
%ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x860(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x340(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x760(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + 
vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x240(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6e0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = 
ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1c0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x660(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, 
%ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, 
%ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x140(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 + vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # 
ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xc0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x560(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, 
%ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x94(%rsi), %ymm1 + vpbroadcastd 0x534(%rsi), %ymm2 + vpsubd 
%ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, 
%ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm10, 0x220(%rdi) + vmovdqa %ymm6, 0x240(%rdi) + vmovdqa %ymm4, 0x260(%rdi) + vmovdqa %ymm3, 0x280(%rdi) + vmovdqa %ymm5, 0x2a0(%rdi) + vmovdqa %ymm8, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpermq $0x1b, 0x4a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x940(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm5, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpermq $0x1b, 0x420(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x8c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, 
%ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x3a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x840(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm9, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpermq $0x1b, 0x320(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x7c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm10, %ymm11, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x2a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x740(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = 
ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpermq $0x1b, 0x220(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x6c0(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + 
vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpermq $0x1b, 0x1a0(%rsi), %ymm3 # ymm3 = mem[3,2,1,0] + vpermq $0x1b, 0x640(%rsi), %ymm15 # ymm15 = mem[3,2,1,0] + vmovshdup %ymm3, %ymm1 # ymm1 = ymm3[1,1,3,3,5,5,7,7] + vmovshdup %ymm15, %ymm2 # ymm2 = ymm15[1,1,3,3,5,5,7,7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, 
%ymm12 + vpmuldq %ymm15, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm3, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm15, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] + vpsrlq $0x20, %ymm4, %ymm4 + vpblendd $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovsldup %ymm7, %ymm4 # ymm4 = ymm7[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7] + vpsrlq $0x20, %ymm6, %ymm6 + vpblendd $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] + vpsrlq $0x20, %ymm8, %ymm8 + vpblendd $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7] + vmovsldup %ymm11, %ymm8 # ymm8 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = 
ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vpermq $0x1b, 0x120(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x5c0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm4, %ymm7, %ymm12 + vpaddd %ymm7, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm6, %ymm9, %ymm12 + vpaddd %ymm6, %ymm9, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm8, %ymm11, %ymm12 
+ vpaddd %ymm11, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpunpcklqdq %ymm4, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] + vpunpckhqdq %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[1],ymm4[1],ymm3[3],ymm4[3] + vpunpcklqdq %ymm8, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] + vpunpckhqdq %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[1],ymm8[1],ymm6[3],ymm8[3] + vpunpcklqdq %ymm7, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm7[0],ymm5[2],ymm7[2] + vpunpckhqdq %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[1],ymm7[1],ymm5[3],ymm7[3] + vpunpcklqdq %ymm11, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vpermq $0x1b, 0xa0(%rsi), %ymm1 # ymm1 = mem[3,2,1,0] + vpermq $0x1b, 0x540(%rsi), %ymm2 # ymm2 = mem[3,2,1,0] + vpsubd %ymm10, %ymm4, %ymm12 + vpaddd %ymm4, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm4 # ymm4 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm4, %ymm4 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm4, %ymm12, %ymm4 # ymm4 = ymm12[0],ymm4[1],ymm12[2],ymm4[3],ymm12[4],ymm4[5],ymm12[6],ymm4[7] + vpsubd %ymm3, %ymm8, %ymm12 + vpaddd %ymm3, %ymm8, %ymm3 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, 
%ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm7, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpsubd %ymm5, %ymm11, %ymm12 + vpaddd %ymm5, %ymm11, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vperm2i128 $0x20, %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0,1],ymm3[0,1] + vperm2i128 $0x31, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[2,3],ymm3[2,3] + vperm2i128 $0x20, %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0,1],ymm5[0,1] + vperm2i128 $0x31, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[2,3],ymm5[2,3] + vperm2i128 $0x20, %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = 
ymm7[2,3],ymm11[2,3] + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpsubd %ymm9, %ymm3, %ymm12 + vpaddd %ymm3, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm3 # ymm3 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm3, %ymm3 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm12, %ymm3 # ymm3 = ymm12[0],ymm3[1],ymm12[2],ymm3[3],ymm12[4],ymm3[5],ymm12[6],ymm3[7] + vpsubd %ymm10, %ymm5, %ymm12 + vpaddd %ymm5, %ymm10, %ymm10 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm5 # ymm5 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm5, %ymm5 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm5, %ymm12, %ymm5 # ymm5 = ymm12[0],ymm5[1],ymm12[2],ymm5[3],ymm12[4],ymm5[5],ymm12[6],ymm5[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm4, %ymm11, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, 
%ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm6, 0x340(%rdi) + vmovdqa %ymm4, 0x360(%rdi) + vmovdqa %ymm3, 0x380(%rdi) + vmovdqa %ymm5, 0x3a0(%rdi) + vmovdqa %ymm8, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, 
%ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, 
%ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 
+ vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, 
%ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd 
$0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, 
%ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd 
%ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 
0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, 
%ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), %ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpsubd %ymm4, %ymm6, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, 
%ymm6 # ymm6 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm6, %ymm6 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm6, %ymm12, %ymm6 # ymm6 = ymm12[0],ymm6[1],ymm12[2],ymm6[3],ymm12[4],ymm6[5],ymm12[6],ymm6[7] + vpsubd %ymm5, %ymm7, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm7 # ymm7 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm7, %ymm7 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm7, %ymm12, %ymm7 # ymm7 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4],ymm7[5],ymm12[6],ymm7[7] + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpsubd %ymm8, %ymm10, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm9, %ymm11, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup 
%ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vpbroadcastd 0x80(%rsi), %ymm1 + vpbroadcastd 0x520(%rsi), %ymm2 + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm8 # ymm8 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm8, %ymm8 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm12, %ymm8 # ymm8 = ymm12[0],ymm8[1],ymm12[2],ymm8[3],ymm12[4],ymm8[5],ymm12[6],ymm8[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm9 # ymm9 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm9, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm9, %ymm9 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm9, %ymm12, %ymm9 # ymm9 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4],ymm9[5],ymm12[6],ymm9[7] + vpsubd %ymm6, %ymm10, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, %ymm10 # ymm10 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm10, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm10, %ymm10 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm10, %ymm12, %ymm10 # ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7] + vpsubd %ymm7, %ymm11, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vpmuldq %ymm1, %ymm12, %ymm13 + vmovshdup %ymm12, 
%ymm11 # ymm11 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm14 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpsubd %ymm13, %ymm12, %ymm12 + vpsubd %ymm14, %ymm11, %ymm11 + vmovshdup %ymm12, %ymm12 # ymm12 = ymm12[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm11, %ymm12, %ymm11 # ymm11 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4],ymm11[5],ymm12[6],ymm11[7] + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa 0x40(%rsi), %ymm1 + vmovdqa 0x60(%rsi), %ymm2 + vpmuldq %ymm1, %ymm4, %ymm12 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm4, %ymm8 # ymm8 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm9 # ymm9 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm4, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm9[1],ymm5[2],ymm9[3],ymm5[4],ymm9[5],ymm5[6],ymm9[7] + vpmuldq %ymm1, %ymm6, %ymm12 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm6, %ymm8 # ymm8 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm9 # ymm9 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm14 + vpmuldq %ymm1, %ymm9, %ymm15 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm0, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, 
%ymm14, %ymm14 + vpmuldq %ymm0, %ymm15, %ymm15 + vpsubd %ymm12, %ymm6, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vpsubd %ymm14, %ymm8, %ymm8 + vpsubd %ymm15, %ymm9, %ymm9 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm8, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7] + vpblendd $0xaa, %ymm9, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm5, 0xe0(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm7, 0x1e0(%rdi) + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_invntt_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/ntt_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/ntt_avx2_asm.S new file mode 100644 index 00000000000..71d7cffc8c8 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/ntt_avx2_asm.S @@ -0,0 +1,2380 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + + /* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/ntt_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_ntt_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_ntt_avx2_asm) +S2N_BN_SYMBOL(mldsa_ntt_avx2_asm): + + .cfi_startproc + vmovdqa (%rsi), %ymm0 + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa (%rdi), %ymm4 + vmovdqa 0x80(%rdi), %ymm5 + vmovdqa 0x100(%rdi), %ymm6 + vmovdqa 0x180(%rdi), %ymm7 + vmovdqa 0x200(%rdi), %ymm8 + vmovdqa 0x280(%rdi), %ymm9 + vmovdqa 0x300(%rdi), %ymm10 + vmovdqa 0x380(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, 
%ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, 
%ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, 
%ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, (%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm6, 0x100(%rdi) + vmovdqa %ymm7, 0x180(%rdi) + vmovdqa %ymm8, 0x200(%rdi) + vmovdqa %ymm9, 0x280(%rdi) + vmovdqa %ymm10, 0x300(%rdi) + vmovdqa %ymm11, 0x380(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0xa0(%rdi), %ymm5 + vmovdqa 0x120(%rdi), %ymm6 + vmovdqa 0x1a0(%rdi), %ymm7 + vmovdqa 0x220(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x320(%rdi), %ymm10 + vmovdqa 0x3a0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + 
vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + 
vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, 
%ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm5, 0xa0(%rdi) + vmovdqa %ymm6, 0x120(%rdi) + vmovdqa %ymm7, 0x1a0(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm9, 0x2a0(%rdi) + vmovdqa %ymm10, 0x320(%rdi) + vmovdqa %ymm11, 0x3a0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x40(%rdi), %ymm4 + vmovdqa 0xc0(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x1c0(%rdi), %ymm7 + vmovdqa 0x240(%rdi), %ymm8 + vmovdqa 0x2c0(%rdi), %ymm9 + vmovdqa 0x340(%rdi), %ymm10 + vmovdqa 0x3c0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, 
%ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, 
%ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, 
%ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x40(%rdi) + vmovdqa %ymm5, 0xc0(%rdi) + vmovdqa %ymm6, 0x140(%rdi) + vmovdqa %ymm7, 0x1c0(%rdi) + vmovdqa %ymm8, 0x240(%rdi) + vmovdqa %ymm9, 0x2c0(%rdi) + vmovdqa %ymm10, 0x340(%rdi) + vmovdqa %ymm11, 0x3c0(%rdi) + vpbroadcastd 0x84(%rsi), %ymm1 + vpbroadcastd 0x524(%rsi), %ymm2 + vmovdqa 0x60(%rdi), %ymm4 + vmovdqa 0xe0(%rdi), %ymm5 + vmovdqa 0x160(%rdi), %ymm6 + vmovdqa 0x1e0(%rdi), %ymm7 + vmovdqa 0x260(%rdi), 
%ymm8 + vmovdqa 0x2e0(%rdi), %ymm9 + vmovdqa 0x360(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, 
%ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vpbroadcastd 0x88(%rsi), %ymm1 + vpbroadcastd 0x528(%rsi), %ymm2 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm12, %ymm12 + 
vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm5, %ymm12 + vpaddd %ymm7, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm5, %ymm5 + vpbroadcastd 0x8c(%rsi), %ymm1 + vpbroadcastd 0x52c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa %ymm4, 0x60(%rdi) + vmovdqa %ymm5, 0xe0(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm7, 0x1e0(%rdi) + vmovdqa %ymm8, 0x260(%rdi) + vmovdqa %ymm9, 0x2e0(%rdi) + vmovdqa %ymm10, 0x360(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vpbroadcastd 0x90(%rsi), %ymm1 + vpbroadcastd 0x530(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, 
%ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xa0(%rsi), %ymm1 + vmovdqa 0x540(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = 
ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x120(%rsi), %ymm1 + vmovdqa 0x5c0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, 
%ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + 
vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = 
ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1a0(%rsi), %ymm1 + vmovdqa 0x640(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, 
%ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x220(%rsi), %ymm1 + vmovdqa 0x6c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, 
%ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2a0(%rsi), %ymm1 + vmovdqa 0x740(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = 
ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x320(%rsi), %ymm1 + vmovdqa 0x7c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3a0(%rsi), %ymm1 + vmovdqa 0x840(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x420(%rsi), %ymm1 + vmovdqa 0x8c0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4a0(%rsi), %ymm1 + vmovdqa 0x940(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm8, 0x20(%rdi) + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm6, 0x60(%rdi) 
+ vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm4, 0xa0(%rdi) + vmovdqa %ymm3, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vpbroadcastd 0x94(%rsi), %ymm1 + vpbroadcastd 0x534(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, 
%ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xc0(%rsi), %ymm1 + vmovdqa 0x560(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + 
vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, 
%ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x140(%rsi), %ymm1 + vmovdqa 0x5e0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + 
vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + 
vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1c0(%rsi), %ymm1 + vmovdqa 0x660(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, 
%ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x240(%rsi), %ymm1 + vmovdqa 0x6e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + 
vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2c0(%rsi), %ymm1 + vmovdqa 0x760(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x340(%rsi), %ymm1 + vmovdqa 0x7e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3c0(%rsi), %ymm1 + vmovdqa 0x860(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x440(%rsi), %ymm1 + vmovdqa 0x8e0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = 
ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4c0(%rsi), %ymm1 + vmovdqa 0x960(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm8, 0x120(%rdi) + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm5, 0x180(%rdi) + vmovdqa %ymm4, 0x1a0(%rdi) + vmovdqa %ymm3, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 
0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vpbroadcastd 0x98(%rsi), %ymm1 + vpbroadcastd 0x538(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = 
ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0xe0(%rsi), %ymm1 + vmovdqa 0x580(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup 
%ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = 
ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x160(%rsi), %ymm1 + vmovdqa 0x600(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = 
ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = 
ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x1e0(%rsi), %ymm1 + vmovdqa 0x680(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + 
vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup 
%ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x260(%rsi), %ymm1 + vmovdqa 0x700(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd 
%ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x2e0(%rsi), %ymm1 + vmovdqa 0x780(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x360(%rsi), %ymm1 + vmovdqa 0x800(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + 
vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x3e0(%rsi), %ymm1 + vmovdqa 0x880(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x460(%rsi), %ymm1 + vmovdqa 0x900(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, 
%ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x4e0(%rsi), %ymm1 + vmovdqa 0x980(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm7, 0x240(%rdi) + vmovdqa %ymm6, 0x260(%rdi) + vmovdqa %ymm5, 0x280(%rdi) + vmovdqa %ymm4, 0x2a0(%rdi) + vmovdqa %ymm3, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vpbroadcastd 0x9c(%rsi), %ymm1 + vpbroadcastd 0x53c(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, 
%ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm4, %ymm12 + vpaddd %ymm4, %ymm8, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm9, %ymm13 + vmovshdup %ymm9, %ymm12 # ymm12 = ymm9[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm9, %ymm9 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm9, %ymm9 # ymm9 = ymm9[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm9, %ymm9 # ymm9 = ymm9[0],ymm12[1],ymm9[2],ymm12[3],ymm9[4],ymm12[5],ymm9[6],ymm12[7] + vpsubd %ymm9, %ymm5, %ymm12 + vpaddd %ymm5, %ymm9, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm9 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm6, %ymm12 + vpaddd %ymm6, %ymm10, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 
= ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm6, %ymm6 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm7, %ymm12 + vpaddd %ymm7, %ymm11, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm7, %ymm7 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vmovdqa 0x100(%rsi), %ymm1 + vmovdqa 0x5a0(%rsi), %ymm2 + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm3, %ymm12 + vpaddd %ymm5, %ymm3, %ymm3 + vmovshdup %ymm13, 
%ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm10, %ymm13 + vmovshdup %ymm10, %ymm12 # ymm12 = ymm10[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm10, %ymm10 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm10, %ymm10 # ymm10 = ymm10[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm10, %ymm10 # ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4],ymm12[5],ymm10[6],ymm12[7] + vpsubd %ymm10, %ymm8, %ymm12 + vpaddd %ymm10, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm10 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm4, %ymm12 + vpaddd %ymm6, %ymm4, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm4, %ymm4 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] 
+ vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm9, %ymm12 + vpaddd %ymm11, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm9, %ymm9 + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovdqa 0x180(%rsi), %ymm1 + vmovdqa 0x620(%rsi), %ymm2 + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm7, %ymm12 + vpaddd %ymm7, %ymm8, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm2, 
%ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm5, %ymm12 + vpaddd %ymm6, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm3, %ymm12 + vpaddd %ymm4, %ymm3, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm3, %ymm3 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm2, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm10, %ymm12 + vpaddd %ymm11, %ymm10, %ymm10 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd 
%ymm13, %ymm10, %ymm10 + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa 0x200(%rsi), %ymm1 + vmovdqa 0x6a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm5, %ymm13 + vmovshdup %ymm5, %ymm12 # ymm12 = ymm5[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm5, %ymm5 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm5, %ymm5 # ymm5 = ymm5[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm5, %ymm5 # ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3],ymm5[4],ymm12[5],ymm5[6],ymm12[7] + vpsubd %ymm5, %ymm9, %ymm12 + vpaddd %ymm5, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 
# ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm5 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm8, %ymm12 + vpaddd %ymm4, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm8, %ymm8 + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm7, %ymm12 + vpaddd %ymm3, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm7, %ymm7 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = 
ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm6, %ymm12 + vpaddd %ymm6, %ymm11, %ymm6 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm6, %ymm6 + vmovdqa 0x280(%rsi), %ymm1 + vmovdqa 0x720(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm7, %ymm13 + vmovshdup %ymm7, %ymm12 # ymm12 = ymm7[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm7, %ymm7 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm7, %ymm7 # ymm7 = ymm7[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm7, %ymm7 # ymm7 = ymm7[0],ymm12[1],ymm7[2],ymm12[3],ymm7[4],ymm12[5],ymm7[6],ymm12[7] + vpsubd %ymm7, %ymm9, %ymm12 + vpaddd %ymm7, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm7 + vpsubd %ymm13, %ymm9, %ymm9 + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm8, %ymm12 + vpaddd %ymm6, %ymm8, %ymm8 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm8, %ymm8 + vmovdqa 0x300(%rsi), 
%ymm1 + vmovdqa 0x7a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm3, %ymm13 + vmovshdup %ymm3, %ymm12 # ymm12 = ymm3[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm3, %ymm3 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm3, %ymm3 # ymm3 = ymm3[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm3, %ymm3 # ymm3 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4],ymm12[5],ymm3[6],ymm12[7] + vpsubd %ymm3, %ymm5, %ymm12 + vpaddd %ymm3, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm3 + vpsubd %ymm13, %ymm5, %ymm5 + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm4, %ymm12 + vpaddd %ymm4, %ymm11, %ymm4 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm4, %ymm4 + vmovdqa 0x380(%rsi), %ymm1 + vmovdqa 0x820(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm8, %ymm13 + vmovshdup %ymm8, %ymm12 # ymm12 = ymm8[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm8, %ymm8 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup 
%ymm8, %ymm8 # ymm8 = ymm8[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm8, %ymm8 # ymm8 = ymm8[0],ymm12[1],ymm8[2],ymm12[3],ymm8[4],ymm12[5],ymm8[6],ymm12[7] + vpsubd %ymm8, %ymm9, %ymm12 + vpaddd %ymm8, %ymm9, %ymm9 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm8 + vpsubd %ymm13, %ymm9, %ymm9 + vmovdqa 0x400(%rsi), %ymm1 + vmovdqa 0x8a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm6, %ymm13 + vmovshdup %ymm6, %ymm12 # ymm12 = ymm6[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4],ymm12[5],ymm6[6],ymm12[7] + vpsubd %ymm6, %ymm7, %ymm12 + vpaddd %ymm6, %ymm7, %ymm7 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm6 + vpsubd %ymm13, %ymm7, %ymm7 + vmovdqa 0x480(%rsi), %ymm1 + vmovdqa 0x920(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm4, %ymm13 + vmovshdup %ymm4, %ymm12 # ymm12 = ymm4[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm4, %ymm4 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm12[1],ymm4[2],ymm12[3],ymm4[4],ymm12[5],ymm4[6],ymm12[7] + vpsubd %ymm4, %ymm5, %ymm12 + vpaddd %ymm4, %ymm5, %ymm5 + vmovshdup %ymm13, %ymm13 # ymm13 = 
ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm4 + vpsubd %ymm13, %ymm5, %ymm5 + vmovdqa 0x500(%rsi), %ymm1 + vmovdqa 0x9a0(%rsi), %ymm2 + vpsrlq $0x20, %ymm1, %ymm10 + vmovshdup %ymm2, %ymm15 # ymm15 = ymm2[1,1,3,3,5,5,7,7] + vpmuldq %ymm1, %ymm11, %ymm13 + vmovshdup %ymm11, %ymm12 # ymm12 = ymm11[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm12, %ymm14 + vpmuldq %ymm2, %ymm11, %ymm11 + vpmuldq %ymm15, %ymm12, %ymm12 + vpmuldq %ymm0, %ymm13, %ymm13 + vpmuldq %ymm0, %ymm14, %ymm14 + vmovshdup %ymm11, %ymm11 # ymm11 = ymm11[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm12, %ymm11, %ymm11 # ymm11 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4],ymm12[5],ymm11[6],ymm12[7] + vpsubd %ymm11, %ymm3, %ymm12 + vpaddd %ymm3, %ymm11, %ymm3 + vmovshdup %ymm13, %ymm13 # ymm13 = ymm13[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm14, %ymm13, %ymm13 # ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4],ymm14[5],ymm13[6],ymm14[7] + vpaddd %ymm13, %ymm12, %ymm11 + vpsubd %ymm13, %ymm3, %ymm3 + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm8, 0x320(%rdi) + vmovdqa %ymm7, 0x340(%rdi) + vmovdqa %ymm6, 0x360(%rdi) + vmovdqa %ymm5, 0x380(%rdi) + vmovdqa %ymm4, 0x3a0(%rdi) + vmovdqa %ymm3, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_ntt_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/nttunpack_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/nttunpack_avx2_asm.S new file mode 100644 index 00000000000..104282c4990 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/nttunpack_avx2_asm.S @@ -0,0 +1,235 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - 
[REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/nttunpack_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_nttunpack_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_nttunpack_avx2_asm) +S2N_BN_SYMBOL(mldsa_nttunpack_avx2_asm): + + .cfi_startproc + vmovdqa (%rdi), %ymm4 + vmovdqa 0x20(%rdi), %ymm5 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa 0x60(%rdi), %ymm7 + vmovdqa 0x80(%rdi), %ymm8 + vmovdqa 0xa0(%rdi), %ymm9 + vmovdqa 0xc0(%rdi), %ymm10 + vmovdqa 0xe0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, 
%ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, (%rdi) + vmovdqa %ymm8, 0x20(%rdi) + vmovdqa %ymm7, 0x40(%rdi) + vmovdqa %ymm6, 0x60(%rdi) + vmovdqa %ymm5, 0x80(%rdi) + vmovdqa %ymm4, 0xa0(%rdi) + vmovdqa %ymm3, 0xc0(%rdi) + vmovdqa %ymm11, 0xe0(%rdi) + vmovdqa 0x100(%rdi), %ymm4 + vmovdqa 0x120(%rdi), %ymm5 + vmovdqa 0x140(%rdi), %ymm6 + vmovdqa 0x160(%rdi), %ymm7 + vmovdqa 0x180(%rdi), %ymm8 + vmovdqa 0x1a0(%rdi), %ymm9 + vmovdqa 0x1c0(%rdi), %ymm10 + vmovdqa 0x1e0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, 
%ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + 
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, 0x100(%rdi) + vmovdqa %ymm8, 0x120(%rdi) + vmovdqa %ymm7, 0x140(%rdi) + vmovdqa %ymm6, 0x160(%rdi) + vmovdqa %ymm5, 0x180(%rdi) + vmovdqa %ymm4, 0x1a0(%rdi) + vmovdqa %ymm3, 0x1c0(%rdi) + vmovdqa %ymm11, 0x1e0(%rdi) + vmovdqa 0x200(%rdi), %ymm4 + vmovdqa 0x220(%rdi), %ymm5 + vmovdqa 0x240(%rdi), %ymm6 + vmovdqa 0x260(%rdi), %ymm7 + vmovdqa 0x280(%rdi), %ymm8 + vmovdqa 0x2a0(%rdi), %ymm9 + vmovdqa 0x2c0(%rdi), %ymm10 + vmovdqa 0x2e0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, 
%ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, 0x200(%rdi) + vmovdqa %ymm8, 0x220(%rdi) + vmovdqa %ymm7, 0x240(%rdi) + vmovdqa %ymm6, 0x260(%rdi) + vmovdqa %ymm5, 0x280(%rdi) + vmovdqa %ymm4, 0x2a0(%rdi) + vmovdqa %ymm3, 0x2c0(%rdi) + vmovdqa %ymm11, 0x2e0(%rdi) + vmovdqa 0x300(%rdi), %ymm4 + vmovdqa 0x320(%rdi), %ymm5 + vmovdqa 0x340(%rdi), %ymm6 + vmovdqa 0x360(%rdi), %ymm7 + vmovdqa 0x380(%rdi), %ymm8 + vmovdqa 0x3a0(%rdi), %ymm9 + vmovdqa 0x3c0(%rdi), %ymm10 + vmovdqa 0x3e0(%rdi), %ymm11 + vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1] + vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3] + vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1] + vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3] + vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1] + vperm2i128 
$0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3] + vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1] + vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3] + vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] + vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] + vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2] + vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] + vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2] + vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3] + vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2] + vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3] + vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7] + vpsrlq $0x20, %ymm7, %ymm7 + vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7] + vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7] + vpsrlq $0x20, %ymm5, %ymm5 + vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7] + vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7] + vpsrlq $0x20, %ymm3, %ymm3 + vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] + vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6] + vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7] + vpsrlq $0x20, %ymm10, %ymm10 + vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = 
ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7] + vmovdqa %ymm9, 0x300(%rdi) + vmovdqa %ymm8, 0x320(%rdi) + vmovdqa %ymm7, 0x340(%rdi) + vmovdqa %ymm6, 0x360(%rdi) + vmovdqa %ymm5, 0x380(%rdi) + vmovdqa %ymm4, 0x3a0(%rdi) + vmovdqa %ymm3, 0x3c0(%rdi) + vmovdqa %ymm11, 0x3e0(%rdi) + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_nttunpack_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l4_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l4_avx2_asm.S new file mode 100644 index 00000000000..95d348e15c8 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l4_avx2_asm.S @@ -0,0 +1,135 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise_acc_l4_avx2_asm.S using scripts/simpasm. Do not modify it directly.
+ */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_pointwise_acc_l4_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_pointwise_acc_l4_avx2_asm) +S2N_BN_SYMBOL(mldsa_pointwise_acc_l4_avx2_asm): + + .cfi_startproc /* Runs 16 loop passes. Each pass reads 0x40 bytes at offsets 0, 0x400, 0x800 and 0xc00 from both %rsi and %rdx, multiplies them 32-bit lane by lane, sums the four 64-bit products per lane, reduces each sum to 32 bits and writes 0x40 bytes to (%rdi). */ + vmovdqa 0x20(%rcx), %ymm0 /* ymm0 multiplies the low half of every accumulated sum in the reduction step; presumably 8 copies of q^-1 mod 2^32 - confirm against the table passed in %rcx */ + vmovdqa (%rcx), %ymm1 /* ymm1 scales that product before it is subtracted from the sum; presumably 8 copies of the modulus q - confirm */ + xorl %eax, %eax /* eax is the pass counter, starting at 0 */ + +Lpointwise_acc_l4_avx2_looptop2: + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 /* vpmuldq multiplies only the even 32-bit lanes, so the odd lanes are first moved down into even positions: vpsrlq for the %rsi operand, vmovshdup for the %rdx operand */ + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 /* ymm2..ymm5 start out as the products of the first term and act as the 64-bit accumulators from here on */ + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 /* accumulators now include the term at offset 0x400 */ + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 /* accumulators now include the term at offset 0x800 */ + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa
0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 /* accumulators now include the fourth and last term, at offset 0xc00 */ + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 /* ymm6..ymm9 = low 32 bits of each sum times ymm0 */ + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 /* low 32 bits of that product times ymm1 */ + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 /* NOTE(review): multiply, multiply, subtract, keep upper half has the shape of a Montgomery reduction with the result in the upper 32 bits of each 64-bit lane - confirm against upstream */ + vpsrlq $0x20, %ymm2, %ymm2 /* move the upper halves of the even-lane results into the even 32-bit slots; the blends below take the odd slots from ymm3 and ymm5, whose upper halves already sit there */ + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) /* store the 16 reassembled 32-bit results of this pass */ + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi /* advance both inputs and the output by the 0x40 bytes just processed */ + addl $0x1, %eax + cmpl $0x10, %eax /* 16 passes of 0x40 bytes cover 0x400 bytes, matching the 0x400 stride between terms */ + jb Lpointwise_acc_l4_avx2_looptop2 + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_pointwise_acc_l4_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l5_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l5_avx2_asm.S new file mode 100644 index 00000000000..ef9fe80e1a0 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l5_avx2_asm.S @@ -0,0 +1,151 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * 
Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise_acc_l5_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_pointwise_acc_l5_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_pointwise_acc_l5_avx2_asm) +S2N_BN_SYMBOL(mldsa_pointwise_acc_l5_avx2_asm): + + .cfi_startproc + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax + +Lpointwise_acc_l5_avx2_looptop2: + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, 
%ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1000(%rsi), %ymm6 + vmovdqa 0x1020(%rsi), %ymm8 + vmovdqa 0x1000(%rdx), %ymm10 + vmovdqa 0x1020(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + 
vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb Lpointwise_acc_l5_avx2_looptop2 + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_pointwise_acc_l5_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l7_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l7_avx2_asm.S new file mode 100644 index 00000000000..018bb0d0866 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_acc_l7_avx2_asm.S @@ -0,0 +1,183 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise_acc_l7_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_pointwise_acc_l7_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_pointwise_acc_l7_avx2_asm) +S2N_BN_SYMBOL(mldsa_pointwise_acc_l7_avx2_asm): + + .cfi_startproc + vmovdqa 0x20(%rcx), %ymm0 + vmovdqa (%rcx), %ymm1 + xorl %eax, %eax + +Lpointwise_acc_l7_avx2_looptop2: + vmovdqa (%rsi), %ymm6 + vmovdqa 0x20(%rsi), %ymm8 + vmovdqa (%rdx), %ymm10 + vmovdqa 0x20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vmovdqa %ymm6, %ymm2 + vmovdqa %ymm7, %ymm3 + vmovdqa %ymm8, %ymm4 + vmovdqa %ymm9, %ymm5 + vmovdqa 0x400(%rsi), %ymm6 + vmovdqa 0x420(%rsi), %ymm8 + vmovdqa 0x400(%rdx), %ymm10 + vmovdqa 0x420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x800(%rsi), %ymm6 + vmovdqa 0x820(%rsi), %ymm8 + vmovdqa 0x800(%rdx), %ymm10 + vmovdqa 0x820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0xc00(%rsi), %ymm6 + vmovdqa 0xc20(%rsi), %ymm8 + vmovdqa 0xc00(%rdx), %ymm10 + vmovdqa 
0xc20(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1000(%rsi), %ymm6 + vmovdqa 0x1020(%rsi), %ymm8 + vmovdqa 0x1000(%rdx), %ymm10 + vmovdqa 0x1020(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1400(%rsi), %ymm6 + vmovdqa 0x1420(%rsi), %ymm8 + vmovdqa 0x1400(%rdx), %ymm10 + vmovdqa 0x1420(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 + vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vmovdqa 0x1800(%rsi), %ymm6 + vmovdqa 0x1820(%rsi), %ymm8 + vmovdqa 0x1800(%rdx), %ymm10 + vmovdqa 0x1820(%rdx), %ymm12 + vpsrlq $0x20, %ymm6, %ymm7 + vpsrlq $0x20, %ymm8, %ymm9 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm6, %ymm6 + vpmuldq %ymm11, %ymm7, %ymm7 + vpmuldq %ymm12, %ymm8, %ymm8 + vpmuldq %ymm13, %ymm9, %ymm9 + vpaddq %ymm2, %ymm6, %ymm2 + vpaddq %ymm3, %ymm7, %ymm3 
+ vpaddq %ymm4, %ymm8, %ymm4 + vpaddq %ymm5, %ymm9, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm6 + vpmuldq %ymm3, %ymm0, %ymm7 + vpmuldq %ymm4, %ymm0, %ymm8 + vpmuldq %ymm5, %ymm0, %ymm9 + vpmuldq %ymm6, %ymm1, %ymm6 + vpmuldq %ymm7, %ymm1, %ymm7 + vpmuldq %ymm8, %ymm1, %ymm8 + vpmuldq %ymm9, %ymm1, %ymm9 + vpsubq %ymm6, %ymm2, %ymm2 + vpsubq %ymm7, %ymm3, %ymm3 + vpsubq %ymm8, %ymm4, %ymm4 + vpsubq %ymm9, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + addq $0x40, %rsi + addq $0x40, %rdx + addq $0x40, %rdi + addl $0x1, %eax + cmpl $0x10, %eax + jb Lpointwise_acc_l7_avx2_looptop2 + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_pointwise_acc_l7_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_avx2_asm.S new file mode 100644 index 00000000000..8fffdc98526 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/pointwise_avx2_asm.S @@ -0,0 +1,127 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. 
+ */ + +#include "_internal_s2n_bignum_x86_att.h" + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/pointwise_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_pointwise_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_pointwise_avx2_asm) +S2N_BN_SYMBOL(mldsa_pointwise_avx2_asm): + + .cfi_startproc + vmovdqa 0x20(%rdx), %ymm0 + vmovdqa (%rdx), %ymm1 + xorl %eax, %eax + +Lpointwise_avx2_looptop1: + vmovdqa (%rdi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa 0x40(%rdi), %ymm6 + vmovdqa (%rsi), %ymm10 + vmovdqa 0x20(%rsi), %ymm12 + vmovdqa 0x40(%rsi), %ymm14 + vpsrlq $0x20, %ymm2, %ymm3 + vpsrlq $0x20, %ymm4, %ymm5 + vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7] + vpsrlq $0x20, %ymm10, %ymm11 + vpsrlq $0x20, %ymm12, %ymm13 + vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm2, %ymm2 + vpmuldq %ymm11, %ymm3, %ymm3 + vpmuldq %ymm12, %ymm4, %ymm4 + vpmuldq %ymm13, %ymm5, %ymm5 + vpmuldq %ymm14, %ymm6, %ymm6 + vpmuldq %ymm15, %ymm7, %ymm7 + vpmuldq %ymm2, %ymm0, %ymm10 + vpmuldq %ymm3, %ymm0, %ymm11 + vpmuldq %ymm4, %ymm0, %ymm12 + vpmuldq %ymm5, %ymm0, %ymm13 + vpmuldq %ymm6, %ymm0, %ymm14 + vpmuldq %ymm7, %ymm0, %ymm15 + vpmuldq %ymm10, %ymm1, %ymm10 + vpmuldq %ymm11, %ymm1, %ymm11 + vpmuldq %ymm12, %ymm1, %ymm12 + vpmuldq %ymm13, %ymm1, %ymm13 + vpmuldq %ymm14, %ymm1, %ymm14 + vpmuldq %ymm15, %ymm1, %ymm15 + vpsubq %ymm10, %ymm2, %ymm2 + vpsubq %ymm11, %ymm3, %ymm3 + vpsubq %ymm12, %ymm4, %ymm4 + vpsubq %ymm13, %ymm5, %ymm5 + vpsubq %ymm14, %ymm6, %ymm6 + vpsubq %ymm15, %ymm7, %ymm7 + vpsrlq $0x20, %ymm2, %ymm2 + vpsrlq $0x20, %ymm4, %ymm4 + vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7] + vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] 
+ vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + vmovdqa %ymm6, 0x40(%rdi) + addq $0x60, %rdi + addq $0x60, %rsi + addl $0x1, %eax + cmpl $0xa, %eax + jb Lpointwise_avx2_looptop1 + vmovdqa (%rdi), %ymm2 + vmovdqa 0x20(%rdi), %ymm4 + vmovdqa (%rsi), %ymm10 + vmovdqa 0x20(%rsi), %ymm12 + vpsrlq $0x20, %ymm2, %ymm3 + vpsrlq $0x20, %ymm4, %ymm5 + vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7] + vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7] + vpmuldq %ymm10, %ymm2, %ymm2 + vpmuldq %ymm11, %ymm3, %ymm3 + vpmuldq %ymm12, %ymm4, %ymm4 + vpmuldq %ymm13, %ymm5, %ymm5 + vpmuldq %ymm2, %ymm0, %ymm10 + vpmuldq %ymm3, %ymm0, %ymm11 + vpmuldq %ymm4, %ymm0, %ymm12 + vpmuldq %ymm5, %ymm0, %ymm13 + vpmuldq %ymm10, %ymm1, %ymm10 + vpmuldq %ymm11, %ymm1, %ymm11 + vpmuldq %ymm12, %ymm1, %ymm12 + vpmuldq %ymm13, %ymm1, %ymm13 + vpsubq %ymm10, %ymm2, %ymm2 + vpsubq %ymm11, %ymm3, %ymm3 + vpsubq %ymm12, %ymm4, %ymm4 + vpsubq %ymm13, %ymm5, %ymm5 + vpsrlq $0x20, %ymm2, %ymm2 + vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7] + vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] + vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7] + vmovdqa %ymm2, (%rdi) + vmovdqa %ymm4, 0x20(%rdi) + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_pointwise_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa/packing.c b/crypto/fipsmodule/ml_dsa/mldsa/packing.c index 26da2738560..6d2c4790994 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/packing.c +++ b/crypto/fipsmodule/ml_dsa/mldsa/packing.c @@ -8,53 +8,38 @@ #include "packing.h" #include "poly.h" #include "polyvec.h" +#include "rounding.h" /* Parameter set namespacing * This is to facilitate building multiple instances * of 
mldsa-native (e.g. with varying parameter sets) * within a single compilation unit. */ -#define mld_unpack_hints MLD_ADD_PARAM_SET(mld_unpack_hints) /* End of parameter set namespacing */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_INTERNAL_API -void mld_pack_pk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyveck *t1) +void mld_unpack_pk_t1(mld_poly *t1, + const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], + unsigned int i) { - unsigned int i; - - mld_memcpy(pk, rho, MLDSA_SEEDBYTES); - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) - invariant(i <= MLDSA_K) - ) - { - mld_polyt1_pack(pk + MLDSA_SEEDBYTES + i * MLDSA_POLYT1_PACKEDBYTES, - &t1->vec[i]); - } + mld_polyt1_unpack(t1, pk + MLDSA_SEEDBYTES + i * MLDSA_POLYT1_PACKEDBYTES); } +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_INTERNAL_API -void mld_unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], mld_polyveck *t1, - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]) +void mld_pack_sk_s1(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], + const mld_polyvecl *s1) { - unsigned int i; - - mld_memcpy(rho, pk, MLDSA_SEEDBYTES); - pk += MLDSA_SEEDBYTES; - - for (i = 0; i < MLDSA_K; ++i) - { - mld_polyt1_unpack(&t1->vec[i], pk + i * MLDSA_POLYT1_PACKEDBYTES); - } + mld_polyvecl_pack_eta(sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES, s1); } MLD_INTERNAL_API -void mld_pack_sk(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[MLDSA_SEEDBYTES], - const uint8_t tr[MLDSA_TRBYTES], - const uint8_t key[MLDSA_SEEDBYTES], const mld_polyveck *t0, - const mld_polyvecl *s1, const mld_polyveck *s2) +void mld_pack_sk_rho_key_tr_s2(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[MLDSA_SEEDBYTES], + const uint8_t tr[MLDSA_TRBYTES], + const uint8_t key[MLDSA_SEEDBYTES], + const mld_polyveck *s2) { mld_memcpy(sk, rho, MLDSA_SEEDBYTES); sk += MLDSA_SEEDBYTES; @@ -65,19 +50,19 @@ void mld_pack_sk(uint8_t 
sk[MLDSA_CRYPTO_SECRETKEYBYTES], mld_memcpy(sk, tr, MLDSA_TRBYTES); sk += MLDSA_TRBYTES; - mld_polyvecl_pack_eta(sk, s1); + /* s1 already packed via mld_pack_sk_s1 */ sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES; mld_polyveck_pack_eta(sk, s2); - sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES; - - mld_polyveck_pack_t0(sk, t0); + /* t0 already packed via mld_compute_pack_t0_t1 */ } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_INTERNAL_API void mld_unpack_sk(uint8_t rho[MLDSA_SEEDBYTES], uint8_t tr[MLDSA_TRBYTES], - uint8_t key[MLDSA_SEEDBYTES], mld_polyveck *t0, - mld_polyvecl *s1, mld_polyveck *s2, + uint8_t key[MLDSA_SEEDBYTES], mld_sk_t0hat *t0, + mld_sk_s1hat *s1, mld_sk_s2hat *s2, const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]) { mld_memcpy(rho, sk, MLDSA_SEEDBYTES); @@ -89,83 +74,83 @@ void mld_unpack_sk(uint8_t rho[MLDSA_SEEDBYTES], uint8_t tr[MLDSA_TRBYTES], mld_memcpy(tr, sk, MLDSA_TRBYTES); sk += MLDSA_TRBYTES; - mld_polyvecl_unpack_eta(s1, sk); + mld_unpack_sk_s1hat(s1, sk); sk += MLDSA_L * MLDSA_POLYETA_PACKEDBYTES; - mld_polyveck_unpack_eta(s2, sk); + mld_unpack_sk_s2hat(s2, sk); sk += MLDSA_K * MLDSA_POLYETA_PACKEDBYTES; - mld_polyveck_unpack_t0(t0, sk); + mld_unpack_sk_t0hat(t0, sk); } MLD_INTERNAL_API -void mld_pack_sig_c_h(uint8_t sig[MLDSA_CRYPTO_BYTES], - const uint8_t c[MLDSA_CTILDEBYTES], const mld_polyveck *h, - const unsigned int number_of_hints) +void mld_pack_sig_c(uint8_t sig[MLDSA_CRYPTO_BYTES], + const uint8_t c[MLDSA_CTILDEBYTES]) { - unsigned int i, j, k; - mld_memcpy(sig, c, MLDSA_CTILDEBYTES); - sig += MLDSA_CTILDEBYTES; - - /* skip z component - packed via mld_pack_sig_z */ - sig += MLDSA_L * MLDSA_POLYZ_PACKEDBYTES; +} - /* Encode hints h */ +MLD_INTERNAL_API +int mld_pack_sig_h(uint8_t sig[MLDSA_CRYPTO_BYTES], const mld_polyveck *w0, + const mld_polyveck *w1) +{ + unsigned int j, k, n; - /* The final section of sig[] is MLDSA_POLYVECH_PACKEDBYTES long, where - * MLDSA_POLYVECH_PACKEDBYTES = MLDSA_OMEGA + 
MLDSA_K + /* The hint section of sig[] is MLDSA_POLYVECH_PACKEDBYTES long, where + * MLDSA_POLYVECH_PACKEDBYTES = MLDSA_OMEGA + MLDSA_K. * * The first OMEGA bytes record the index numbers of the coefficients - * that are not equal to 0 + * that are not equal to 0. * * The final K bytes record a running tally of the number of hints - * coming from each of the K polynomials in h. - * - * The pre-condition tells us that number_of_hints <= OMEGA, so some - * bytes may not be written, so we initialize all of them to zero - * to start. - */ - mld_memset(sig, 0, MLDSA_POLYVECH_PACKEDBYTES); + * coming from each of the K polynomials. */ + uint8_t *sig_h = sig + MLDSA_CTILDEBYTES + MLDSA_L * MLDSA_POLYZ_PACKEDBYTES; + + mld_memset(sig_h, 0, MLDSA_POLYVECH_PACKEDBYTES); + n = 0; - k = 0; - /* For each polynomial in h... */ - for (i = 0; i < MLDSA_K; ++i) + /* For each coefficient of each polynomial, compute its hint bit and, if + * non-zero, record the index in the hint section of sig. If recording the + * hint would overflow the OMEGA-sized index array, abort early and return + * MLD_ERR_FAIL. The caller is expected to reject the signature in that case. + * + * Constant time: At this point w0/w1 are public (see comment in sign.c + * before the call), so a data-dependent early return is fine. 
*/ + for (k = 0; k < MLDSA_K; k++) __loop__( - assigns(i, j, k, memory_slice(sig, MLDSA_POLYVECH_PACKEDBYTES)) - invariant(i <= MLDSA_K) - invariant(k <= number_of_hints) - invariant(number_of_hints <= MLDSA_OMEGA) + assigns(k, j, n, memory_slice(sig_h, MLDSA_POLYVECH_PACKEDBYTES)) + invariant(k <= MLDSA_K && n <= MLDSA_OMEGA) + decreases(MLDSA_K - k) ) { - /* For each coefficient in that polynomial, record it as as hint */ - /* if its value is not zero */ - for (j = 0; j < MLDSA_N; ++j) + for (j = 0; j < MLDSA_N; j++) __loop__( - assigns(j, k, memory_slice(sig, MLDSA_POLYVECH_PACKEDBYTES)) - invariant(i <= MLDSA_K) - invariant(j <= MLDSA_N) - invariant(k <= number_of_hints) - invariant(number_of_hints <= MLDSA_OMEGA) + assigns(j, n, memory_slice(sig_h, MLDSA_POLYVECH_PACKEDBYTES)) + invariant(j <= MLDSA_N && n <= MLDSA_OMEGA) + decreases(MLDSA_N - j) ) { - /* The reference implementation implicitly relies on the total */ - /* number of hints being less than OMEGA, assuming h is valid. */ - /* In mldsa-native, we check this explicitly to ease proof of */ - /* type safety. */ - if (h->vec[i].coeffs[j] != 0 && k < number_of_hints) + const unsigned int hint_bit = + mld_make_hint(w0->vec[k].coeffs[j], w1->vec[k].coeffs[j]); + if (hint_bit) { - /* The enclosing if condition AND the loop invariant infer */ - /* that k < MLDSA_OMEGA, so writing to sig[k] is safe and k */ - /* can be incremented. */ - sig[k++] = (uint8_t)j; + if (n == MLDSA_OMEGA) + { + return MLD_ERR_FAIL; + } + /* Safety: branch above ensures n < MLDSA_OMEGA so n is a valid index + * into the OMEGA-sized index array; j < MLDSA_N <= 256 fits in + * uint8_t. */ + sig_h[n] = (uint8_t)j; + n++; } } - /* Having recorded all the hints for this polynomial, also */ - /* record the running tally into the correct "slot" for that */ - /* coefficient in the final K bytes */ - sig[MLDSA_OMEGA + i] = (uint8_t)k; + /* Record the running tally into the correct slot for this polynomial. 
+ * Safety: k < MLDSA_K, so MLDSA_OMEGA + k is a valid index into the + * K-byte tally tail; n <= MLDSA_OMEGA fits in uint8_t. */ + sig_h[MLDSA_OMEGA + k] = (uint8_t)n; } + return 0; } MLD_INTERNAL_API @@ -176,111 +161,62 @@ void mld_pack_sig_z(uint8_t sig[MLDSA_CRYPTO_BYTES], const mld_poly *zi, sig += i * MLDSA_POLYZ_PACKEDBYTES; mld_polyz_pack(sig, zi); } +#endif /* !MLD_CONFIG_NO_SIGN_API */ -/************************************************* - * Name: mld_unpack_hints - * - * Description: Unpack raw hint bytes into a polyveck - * struct - * - * Arguments: - mld_polyveck *h: pointer to output hint vector h - * - const uint8_t packed_hints[MLDSA_POLYVECH_PACKEDBYTES]: - * raw hint bytes - * - * Returns 1 in case of malformed hints; otherwise 0. - **************************************************/ -static int mld_unpack_hints( - mld_polyveck *h, const uint8_t packed_hints[MLDSA_POLYVECH_PACKEDBYTES]) -__contract__( - requires(memory_no_alias(packed_hints, MLDSA_POLYVECH_PACKEDBYTES)) - requires(memory_no_alias(h, sizeof(mld_polyveck))) - assigns(memory_slice(h, sizeof(mld_polyveck))) - /* All returned coefficients are either 0 or 1 */ - ensures(forall(k1, 0, MLDSA_K, - array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - ensures(return_value >= 0 && return_value <= 1) -) +#if !defined(MLD_CONFIG_NO_VERIFY_API) +MLD_INTERNAL_API +int mld_sig_unpack_hints(mld_poly *h, const uint8_t sig[MLDSA_CRYPTO_BYTES], + unsigned int i) { - unsigned int i, j; - unsigned int old_hint_count; + const uint8_t *packed_hints = + sig + MLDSA_CTILDEBYTES + MLDSA_L * MLDSA_POLYZ_PACKEDBYTES; + const unsigned int old_hint_count = + (i == 0) ? 0 : packed_hints[MLDSA_OMEGA + i - 1]; + const unsigned int new_hint_count = packed_hints[MLDSA_OMEGA + i]; + unsigned int j; + + if (new_hint_count < old_hint_count || new_hint_count > MLDSA_OMEGA) + { + return MLD_ERR_FAIL; + } - /* Set all coefficients of all polynomials to 0. 
*/ - /* Only those that are actually non-zero hints will */ - /* be overwritten below. */ - mld_memset(h, 0, sizeof(mld_polyveck)); + mld_memset(h, 0, sizeof(mld_poly)); - old_hint_count = 0; - for (i = 0; i < MLDSA_K; ++i) + for (j = old_hint_count; j < new_hint_count; ++j) __loop__( - invariant(i <= MLDSA_K) - /* Maintain the post-condition */ - invariant(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) + invariant(j >= old_hint_count && j <= new_hint_count && + new_hint_count <= MLDSA_OMEGA) + invariant(array_bound(h->coeffs, 0, MLDSA_N, 0, 2)) + decreases(new_hint_count - j) ) { - /* Grab the hint count for the i'th polynomial */ - const unsigned int new_hint_count = packed_hints[MLDSA_OMEGA + i]; - - /* new_hint_count must increase or stay the same, but also remain */ - /* less than or equal to MLDSA_OMEGA */ - if (new_hint_count < old_hint_count || new_hint_count > MLDSA_OMEGA) + if (j > old_hint_count && packed_hints[j] <= packed_hints[j - 1]) { - /* Error - new_hint_count is invalid */ - return 1; + return MLD_ERR_FAIL; } + /* Safety: packed_hints[j] is uint8_t (<= 255) and MLDSA_N == 256. */ + h->coeffs[packed_hints[j]] = 1; + } - /* If new_hint_count == old_hint_count, then this polynomial has */ - /* zero hints, so this loop executes zero times and we move */ - /* straight on to the next polynomial. */ - for (j = old_hint_count; j < new_hint_count; ++j) + /* On the last row, also verify that the trailing index slots are zero. 
*/ + if (i == MLDSA_K - 1) + { + for (j = new_hint_count; j < MLDSA_OMEGA; ++j) __loop__( - invariant(i <= MLDSA_K) - /* Maintain the post-condition */ - invariant(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - ) + invariant(j <= MLDSA_OMEGA) + decreases(MLDSA_OMEGA - j) + ) { - const uint8_t this_hint_index = packed_hints[j]; - - /* Coefficients must be ordered for strong unforgeability */ - if (j > old_hint_count && this_hint_index <= packed_hints[j - 1]) + if (packed_hints[j] != 0) { - return 1; + return MLD_ERR_FAIL; } - h->vec[i].coeffs[this_hint_index] = 1; - } - - old_hint_count = new_hint_count; - } - - /* Extra indices must be zero for strong unforgeability */ - for (j = old_hint_count; j < MLDSA_OMEGA; ++j) - __loop__( - invariant(j <= MLDSA_OMEGA) - /* Maintain the post-condition */ - invariant(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - ) - { - if (packed_hints[j] != 0) - { - return 1; } } return 0; } - -MLD_INTERNAL_API -int mld_unpack_sig(uint8_t c[MLDSA_CTILDEBYTES], mld_polyvecl *z, - mld_polyveck *h, const uint8_t sig[MLDSA_CRYPTO_BYTES]) -{ - mld_memcpy(c, sig, MLDSA_CTILDEBYTES); - sig += MLDSA_CTILDEBYTES; - - mld_polyvecl_unpack_z(z, sig); - sig += MLDSA_L * MLDSA_POLYZ_PACKEDBYTES; - - return mld_unpack_hints(h, sig); -} +#endif /* !MLD_CONFIG_NO_VERIFY_API */ /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ -#undef mld_unpack_hints diff --git a/crypto/fipsmodule/ml_dsa/mldsa/packing.h b/crypto/fipsmodule/ml_dsa/mldsa/packing.h index e0036688935..023998018c5 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/packing.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/packing.h @@ -5,115 +5,116 @@ #ifndef MLD_PACKING_H #define MLD_PACKING_H -#include #include "polyvec.h" +#include "polyvec_lazy.h" -#define mld_pack_pk MLD_NAMESPACE_KL(pack_pk) -/************************************************* - * Name: mld_pack_pk +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +#define mld_pack_sk_s1 MLD_NAMESPACE_KL(pack_sk_s1) +/** + * Bit-pack the s1 component into the secret key. * - * Description: Bit-pack public key pk = (rho, t1). - * - * Arguments: - uint8_t pk[]: output byte array - * - const uint8_t rho[]: byte array containing rho - * - const mld_polyveck *t1: pointer to vector t1 - **************************************************/ + * @param[out] sk Output byte array. + * @param[in] s1 Pointer to vector s1. + */ MLD_INTERNAL_API -void mld_pack_pk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[MLDSA_SEEDBYTES], const mld_polyveck *t1) +void mld_pack_sk_s1(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], + const mld_polyvecl *s1) __contract__( - requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) - requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) - requires(memory_no_alias(t1, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, - array_bound(t1->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10))) - assigns(memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) + requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) + requires(memory_no_alias(s1, sizeof(mld_polyvecl))) + requires(forall(k1, 0, MLDSA_L, + array_abs_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1))) + assigns(memory_slice(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) ); - -#define mld_pack_sk MLD_NAMESPACE_KL(pack_sk) -/************************************************* - * Name: mld_pack_sk - * - * Description: Bit-pack secret key sk = 
(rho, tr, key, t0, s1, s2). - * - * Arguments: - uint8_t sk[]: output byte array - * - const uint8_t rho[]: byte array containing rho - * - const uint8_t tr[]: byte array containing tr - * - const uint8_t key[]: byte array containing key - * - const mld_polyveck *t0: pointer to vector t0 - * - const mld_polyvecl *s1: pointer to vector s1 - * - const mld_polyveck *s2: pointer to vector s2 - **************************************************/ +#define mld_pack_sk_rho_key_tr_s2 MLD_NAMESPACE_KL(pack_sk_rho_key_tr_s2) +/** + * Bit-pack rho, key, tr, s2 into the secret key. + * + * s1 must already be packed via mld_pack_sk_s1, and t0 via + * mld_compute_pack_t0_t1. + * + * @param[out] sk Output byte array. + * @param[in] rho Byte array containing rho. + * @param[in] tr Byte array containing tr. + * @param[in] key Byte array containing key. + * @param[in] s2 Pointer to vector s2. + */ MLD_INTERNAL_API -void mld_pack_sk(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[MLDSA_SEEDBYTES], - const uint8_t tr[MLDSA_TRBYTES], - const uint8_t key[MLDSA_SEEDBYTES], const mld_polyveck *t0, - const mld_polyvecl *s1, const mld_polyveck *s2) +void mld_pack_sk_rho_key_tr_s2(uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[MLDSA_SEEDBYTES], + const uint8_t tr[MLDSA_TRBYTES], + const uint8_t key[MLDSA_SEEDBYTES], + const mld_polyveck *s2) __contract__( requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) requires(memory_no_alias(tr, MLDSA_TRBYTES)) requires(memory_no_alias(key, MLDSA_SEEDBYTES)) - requires(memory_no_alias(t0, sizeof(mld_polyveck))) - requires(memory_no_alias(s1, sizeof(mld_polyvecl))) requires(memory_no_alias(s2, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, - array_bound(t0->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))) - requires(forall(k1, 0, MLDSA_L, - array_abs_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLDSA_ETA + 1))) requires(forall(k2, 0, MLDSA_K, 
array_abs_bound(s2->vec[k2].coeffs, 0, MLDSA_N, MLDSA_ETA + 1))) assigns(memory_slice(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ -#define mld_pack_sig_c_h MLD_NAMESPACE_KL(pack_sig_c_h) -/************************************************* - * Name: mld_pack_sig_c_h - * - * Description: Bit-pack c and h component of sig = (c, z, h). - * The z component is packed separately using mld_pack_sig_z. - * - * Arguments: - uint8_t sig[]: output byte array - * - const uint8_t *c: pointer to challenge hash length - * MLDSA_SEEDBYTES - * - const mld_polyveck *h: pointer to hint vector h - * - const unsigned int number_of_hints: total - * hints in *h +#if !defined(MLD_CONFIG_NO_SIGN_API) +#define mld_pack_sig_c MLD_NAMESPACE_KL(pack_sig_c) +/** + * Bit-pack challenge c into sig = (c, z, h). * - * Note that the number_of_hints argument is not present - * in the reference implementation. It is added here to ease - * proof of type safety. - **************************************************/ + * @param[out] sig Output byte array. + * @param[in] c Pointer to challenge hash. + */ MLD_INTERNAL_API -void mld_pack_sig_c_h(uint8_t sig[MLDSA_CRYPTO_BYTES], - const uint8_t c[MLDSA_CTILDEBYTES], const mld_polyveck *h, - const unsigned int number_of_hints) +void mld_pack_sig_c(uint8_t sig[MLDSA_CRYPTO_BYTES], + const uint8_t c[MLDSA_CTILDEBYTES]) __contract__( requires(memory_no_alias(sig, MLDSA_CRYPTO_BYTES)) requires(memory_no_alias(c, MLDSA_CTILDEBYTES)) - requires(memory_no_alias(h, sizeof(mld_polyveck))) - requires(forall(k1, 0, MLDSA_K, - array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - requires(number_of_hints <= MLDSA_OMEGA) assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) ); +#define mld_pack_sig_h MLD_NAMESPACE_KL(pack_sig_h) +/** + * Compute hints from (w0, w1) and pack them into the hint section of sig. + * + * @param[in,out] sig Byte array containing signature. + * @param[in] w0 Pointer to low part of input vector. 
+ * @param[in] w1 Pointer to high part of input vector. + * + * @retval 0 Success. + * @retval MLD_ERR_FAIL The total number of hints exceeds MLDSA_OMEGA. In this + * case the hint section of sig is left in a + * partially-written state and the caller must reject the + * signature. + */ +MLD_INTERNAL_API +MLD_MUST_CHECK_RETURN_VALUE +int mld_pack_sig_h(uint8_t sig[MLDSA_CRYPTO_BYTES], const mld_polyveck *w0, + const mld_polyveck *w1) +__contract__( + requires(memory_no_alias(sig, MLDSA_CRYPTO_BYTES)) + requires(memory_no_alias(w0, sizeof(mld_polyveck))) + requires(memory_no_alias(w1, sizeof(mld_polyveck))) + assigns(memory_slice( + sig + MLDSA_CTILDEBYTES + MLDSA_L * MLDSA_POLYZ_PACKEDBYTES, + MLDSA_POLYVECH_PACKEDBYTES)) + ensures(return_value == 0 || return_value == MLD_ERR_FAIL) +); + #define mld_pack_sig_z MLD_NAMESPACE_KL(pack_sig_z) -/************************************************* - * Name: mld_pack_sig_z - * - * Description: Bit-pack single polynomial of z component of sig = (c, z, h). - * The c and h components are packed separately using - * mld_pack_sig_c_h. +/** + * Bit-pack single polynomial of z component of sig = (c, z, h). * - * Arguments: - uint8_t sig[]: output byte array - * - const mld_poly *zi: pointer to a single polynomial in z - * - const unsigned int i: index of zi in vector z + * The c and h components are packed separately using mld_pack_sig_c and + * mld_pack_sig_h. * - **************************************************/ + * @param[in,out] sig Output byte array. + * @param[in] zi Pointer to a single polynomial in z. + * @param i Index of zi in vector z. 
+ */ MLD_INTERNAL_API void mld_pack_sig_z(uint8_t sig[MLDSA_CRYPTO_BYTES], const mld_poly *zi, unsigned i) @@ -124,102 +125,119 @@ __contract__( requires(array_bound(zi->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -#define mld_unpack_pk MLD_NAMESPACE_KL(unpack_pk) -/************************************************* - * Name: mld_unpack_pk - * - * Description: Unpack public key pk = (rho, t1). - * - * Arguments: - const uint8_t rho[]: output byte array for rho - * - const mld_polyveck *t1: pointer to output vector t1 - * - uint8_t pk[]: byte array containing bit-packed pk - **************************************************/ +#if !defined(MLD_CONFIG_NO_VERIFY_API) +#define mld_unpack_pk_t1 MLD_NAMESPACE_KL(unpack_pk_t1) +/** + * Unpack a single polynomial of the t1 component of a public key + * pk = (rho, t1). + * + * @param[out] t1 Pointer to output polynomial t1[i]. + * @param[in] pk Byte array containing bit-packed pk. + * @param i Row index, must be < MLDSA_K. 
+ */ MLD_INTERNAL_API -void mld_unpack_pk(uint8_t rho[MLDSA_SEEDBYTES], mld_polyveck *t1, - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]) +void mld_unpack_pk_t1(mld_poly *t1, + const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], + unsigned int i) __contract__( requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) - requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) - requires(memory_no_alias(t1, sizeof(mld_polyveck))) - assigns(memory_slice(rho, MLDSA_SEEDBYTES)) - assigns(memory_slice(t1, sizeof(mld_polyveck))) - ensures(forall(k0, 0, MLDSA_K, - array_bound(t1->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10))) + requires(memory_no_alias(t1, sizeof(mld_poly))) + requires(i < MLDSA_K) + assigns(memory_slice(t1, sizeof(mld_poly))) + ensures(array_bound(t1->coeffs, 0, MLDSA_N, 0, 1 << 10)) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ - +#if !defined(MLD_CONFIG_NO_SIGN_API) #define mld_unpack_sk MLD_NAMESPACE_KL(unpack_sk) -/************************************************* - * Name: mld_unpack_sk - * - * Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). - * - * Arguments: - const uint8_t rho[]: output byte array for rho - * - const uint8_t tr[]: output byte array for tr - * - const uint8_t key[]: output byte array for key - * - const mld_polyveck *t0: pointer to output vector t0 - * - const mld_polyvecl *s1: pointer to output vector s1 - * - const mld_polyveck *s2: pointer to output vector s2 - * - uint8_t sk[]: byte array containing bit-packed sk - **************************************************/ +/** + * Unpack secret key sk = (rho, tr, key, t0, s1, s2). + * + * NOTE: In REDUCE_RAM mode, s1/s2/t0 borrow from sk rather than copying. + * + * @param[out] rho Output byte array for rho. + * @param[out] tr Output byte array for tr. + * @param[out] key Output byte array for key. + * @param[out] t0 Pointer to output vector t0. + * @param[out] s1 Pointer to output vector s1. + * @param[out] s2 Pointer to output vector s2. 
+ * @param[in] sk Byte array containing bit-packed sk. + */ MLD_INTERNAL_API void mld_unpack_sk(uint8_t rho[MLDSA_SEEDBYTES], uint8_t tr[MLDSA_TRBYTES], - uint8_t key[MLDSA_SEEDBYTES], mld_polyveck *t0, - mld_polyvecl *s1, mld_polyveck *s2, + uint8_t key[MLDSA_SEEDBYTES], mld_sk_t0hat *t0, + mld_sk_s1hat *s1, mld_sk_s2hat *s2, const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]) __contract__( requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) requires(memory_no_alias(tr, MLDSA_TRBYTES)) requires(memory_no_alias(key, MLDSA_SEEDBYTES)) - requires(memory_no_alias(t0, sizeof(mld_polyveck))) - requires(memory_no_alias(s1, sizeof(mld_polyvecl))) - requires(memory_no_alias(s2, sizeof(mld_polyveck))) + requires(memory_no_alias(t0, sizeof(mld_sk_t0hat))) + requires(memory_no_alias(s1, sizeof(mld_sk_s1hat))) + requires(memory_no_alias(s2, sizeof(mld_sk_s2hat))) requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) assigns(memory_slice(rho, MLDSA_SEEDBYTES)) assigns(memory_slice(tr, MLDSA_TRBYTES)) assigns(memory_slice(key, MLDSA_SEEDBYTES)) - assigns(memory_slice(t0, sizeof(mld_polyveck))) - assigns(memory_slice(s1, sizeof(mld_polyvecl))) - assigns(memory_slice(s2, sizeof(mld_polyveck))) - ensures(forall(k0, 0, MLDSA_K, - array_bound(t0->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))) - ensures(forall(k1, 0, MLDSA_L, - array_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) - ensures(forall(k2, 0, MLDSA_K, - array_bound(s2->vec[k2].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) + assigns(memory_slice(t0, sizeof(mld_sk_t0hat))) + assigns(memory_slice(s1, sizeof(mld_sk_s1hat))) + assigns(memory_slice(s2, sizeof(mld_sk_s2hat))) + MLD_IF_NOT_REDUCE_RAM( + ensures(forall(k0, 0, MLDSA_K, + array_abs_bound(t0->vec.vec[k0].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + ensures(forall(k1, 0, MLDSA_L, + array_abs_bound(s1->vec.vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + ensures(forall(k2, 0, MLDSA_K, + 
array_abs_bound(s2->vec.vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + ) + MLD_IF_REDUCE_RAM( + ensures(s1->packed == old(sk) + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES) + ensures(s2->packed == old(sk) + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + + MLDSA_L * MLDSA_POLYETA_PACKEDBYTES) + ensures(t0->packed == old(sk) + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + + (MLDSA_L + MLDSA_K) * MLDSA_POLYETA_PACKEDBYTES) + ) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -#define mld_unpack_sig MLD_NAMESPACE_KL(unpack_sig) -/************************************************* - * Name: mld_unpack_sig - * - * Description: Unpack signature sig = (c, z, h). - * - * Arguments: - uint8_t *c: pointer to output challenge hash - * - mld_polyvecl *z: pointer to output vector z - * - mld_polyveck *h: pointer to output hint vector h - * - const uint8_t sig[]: byte array containing - * bit-packed signature - * - * Returns 1 in case of malformed signature; otherwise 0. - **************************************************/ -MLD_MUST_CHECK_RETURN_VALUE +#if !defined(MLD_CONFIG_NO_VERIFY_API) +#define mld_sig_unpack_hints MLD_NAMESPACE_KL(sig_unpack_hints) +/** + * Decode and validate a single row of the hint vector h from a signature + * buffer. + * + * The hint encoding is shared across all rows (a count array followed by a + * single index list), so this function performs the validation relevant to + * row i: + * - the i'th hint count is non-decreasing and bounded by MLDSA_OMEGA; + * - the indices for row i are strictly ascending; + * - on i == MLDSA_K - 1, the trailing index slots are zero. + * + * Callers must invoke this for every i in [0, 1, .., MLDSA_K - 1]; if any + * call returns MLD_ERR_FAIL the encoding is malformed and the signature must + * be rejected. + * + * @param[out] h Pointer to output polynomial h[i]. + * @param[in] sig Signature buffer. + * @param i Row index, must be < MLDSA_K. + * + * @retval 0 Hints were decoded successfully. + * @retval MLD_ERR_FAIL Hints are malformed. 
+ */ MLD_INTERNAL_API -int mld_unpack_sig(uint8_t c[MLDSA_CTILDEBYTES], mld_polyvecl *z, - mld_polyveck *h, const uint8_t sig[MLDSA_CRYPTO_BYTES]) +MLD_MUST_CHECK_RETURN_VALUE +int mld_sig_unpack_hints(mld_poly *h, const uint8_t sig[MLDSA_CRYPTO_BYTES], + unsigned int i) __contract__( requires(memory_no_alias(sig, MLDSA_CRYPTO_BYTES)) - requires(memory_no_alias(c, MLDSA_CTILDEBYTES)) - requires(memory_no_alias(z, sizeof(mld_polyvecl))) - requires(memory_no_alias(h, sizeof(mld_polyveck))) - assigns(memory_slice(c, MLDSA_CTILDEBYTES)) - assigns(memory_slice(z, sizeof(mld_polyvecl))) - assigns(memory_slice(h, sizeof(mld_polyveck))) - ensures(forall(k0, 0, MLDSA_L, - array_bound(z->vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) - ensures(forall(k1, 0, MLDSA_K, - array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - ensures(return_value >= 0 && return_value <= 1) + requires(memory_no_alias(h, sizeof(mld_poly))) + requires(i < MLDSA_K) + assigns(memory_slice(h, sizeof(mld_poly))) + ensures(return_value == 0 || return_value == MLD_ERR_FAIL) + ensures(return_value == 0 ==> array_bound(h->coeffs, 0, MLDSA_N, 0, 2)) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ + #endif /* !MLD_PACKING_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly.c b/crypto/fipsmodule/ml_dsa/mldsa/poly.c index 9004f11581a..09f3ab00bc7 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/poly.c +++ b/crypto/fipsmodule/ml_dsa/mldsa/poly.c @@ -18,13 +18,11 @@ * https://github.com/pq-crystals/dilithium/tree/master/ref */ -#include -#include +#include "poly.h" #include "common.h" #include "ct.h" #include "debug.h" -#include "poly.h" #include "reduce.h" #include "rounding.h" #include "symmetric.h" @@ -42,7 +40,8 @@ void mld_poly_reduce(mld_poly *a) __loop__( invariant(i <= MLDSA_N) invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0])) - invariant(array_bound(a->coeffs, 0, i, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX))) + invariant(array_bound(a->coeffs, 0, i, 
-MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX)) + decreases(MLDSA_N - i)) { a->coeffs[i] = mld_reduce32(a->coeffs[i]); } @@ -67,6 +66,7 @@ __contract__( invariant(i <= MLDSA_N) invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0])) invariant(array_bound(a->coeffs, 0, i, 0, MLDSA_Q)) + decreases(MLDSA_N - i) ) { a->coeffs[i] = mld_caddq(a->coeffs[i]); @@ -91,6 +91,8 @@ void mld_poly_caddq(mld_poly *a) mld_poly_caddq_c(a); } +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_SIGN_API) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) /* Reference: We use destructive version (output=first input) to avoid * reasoning about aliasing in the CBMC specification */ MLD_INTERNAL_API @@ -105,12 +107,16 @@ void mld_poly_add(mld_poly *r, const mld_poly *b) invariant(forall(k1, 0, i, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])) invariant(forall(k2, 0, i, r->coeffs[k2] < MLD_REDUCE32_DOMAIN_MAX)) invariant(forall(k2, 0, i, r->coeffs[k2] >= INT32_MIN)) + decreases(MLDSA_N - i) ) { r->coeffs[i] = r->coeffs[i] + b->coeffs[i]; } } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_SIGN_API || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) /* Reference: We use destructive version (output=first input) to avoid * reasoning about aliasing in the CBMC specification */ MLD_INTERNAL_API @@ -125,6 +131,7 @@ void mld_poly_sub(mld_poly *r, const mld_poly *b) invariant(i <= MLDSA_N) invariant(array_bound(r->coeffs, 0, i, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX)) invariant(forall(k0, i, MLDSA_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0])) + decreases(MLDSA_N - i) ) { r->coeffs[i] = r->coeffs[i] - b->coeffs[i]; @@ -132,7 +139,9 @@ void mld_poly_sub(mld_poly *r, const mld_poly *b) mld_assert_bound(r->coeffs, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX); } +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ +#if 
!defined(MLD_CONFIG_NO_VERIFY_API) MLD_INTERNAL_API void mld_poly_shiftl(mld_poly *a) { @@ -143,7 +152,8 @@ void mld_poly_shiftl(mld_poly *a) __loop__( invariant(i <= MLDSA_N) invariant(array_bound(a->coeffs, 0, i, 0, MLDSA_Q)) - invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0]))) + invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0])) + decreases(MLDSA_N - i)) { /* Reference: uses a left shift by MLDSA_D which is undefined behaviour in * C90/C99 @@ -152,7 +162,7 @@ void mld_poly_shiftl(mld_poly *a) } mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); } - +#endif /* !MLD_CONFIG_NO_VERIFY_API */ static MLD_INLINE int32_t mld_fqmul(int32_t a, int32_t b) __contract__( @@ -231,7 +241,8 @@ __contract__( invariant(array_abs_bound(r, 0, j, bound + MLDSA_Q)) invariant(array_abs_bound(r, j, start + len, bound)) invariant(array_abs_bound(r, start + len, j + len, bound + MLDSA_Q)) - invariant(array_abs_bound(r, j + len, MLDSA_N, bound))) + invariant(array_abs_bound(r, j + len, MLDSA_N, bound)) + decreases(start + len - j)) { int32_t t; t = mld_fqmul(r[j + len], zeta); @@ -268,7 +279,8 @@ __contract__( invariant(k <= MLDSA_N) invariant(2 * len * k == start + MLDSA_N) invariant(array_abs_bound(r, 0, start, layer * MLDSA_Q + MLDSA_Q)) - invariant(array_abs_bound(r, start, MLDSA_N, layer * MLDSA_Q))) + invariant(array_abs_bound(r, start, MLDSA_N, layer * MLDSA_Q)) + decreases(MLDSA_N - start)) { int32_t zeta = mld_zetas[k++]; mld_ntt_butterfly_block(r, zeta, start, len, layer * MLDSA_Q); @@ -294,6 +306,7 @@ __contract__( __loop__( invariant(1 <= layer && layer <= 9) invariant(array_abs_bound(r, 0, MLDSA_N, layer * MLDSA_Q)) + decreases(9 - layer) ) { mld_ntt_layer(r, layer); @@ -318,17 +331,15 @@ void mld_poly_ntt(mld_poly *a) mld_poly_ntt_c(a); } -/************************************************* - * Name: mld_fqscale +/** + * Scale a field element by mont/256, i.e., perform Montgomery multiplication + * by mont^2/256. 
* - * Description: Scales a field element by mont/256 , i.e., performs Montgomery - * multiplication by mont^2/256. - * Input is expected to have absolute value smaller than - * 256 * MLDSA_Q. - * Output has absolute value smaller than MLD_INTT_BOUND. + * Input is expected to have absolute value smaller than 256 * MLDSA_Q. Output + * has absolute value smaller than MLD_INTT_BOUND. * - * Arguments: - int32_t a: Field element to be scaled. - **************************************************/ + * @param a Field element to be scaled. + */ static MLD_INLINE int32_t mld_fqscale(int32_t a) __contract__( requires(a > -256*MLDSA_Q && a < 256*MLDSA_Q) @@ -360,7 +371,8 @@ __contract__( invariant(start <= MLDSA_N && k <= 255) invariant(2 * len * k + start == 2 * MLDSA_N - 2 * len) invariant(array_abs_bound(r, 0, start, (MLDSA_N >> (layer - 1)) * MLDSA_Q)) - invariant(array_abs_bound(r, start, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q))) + invariant(array_abs_bound(r, start, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q)) + decreases(MLDSA_N - start)) { unsigned j; int32_t zeta = -mld_zetas[k--]; @@ -371,7 +383,8 @@ __contract__( invariant(array_abs_bound(r, 0, start, (MLDSA_N >> (layer - 1)) * MLDSA_Q)) invariant(array_abs_bound(r, start, j, (MLDSA_N >> (layer - 1)) * MLDSA_Q)) invariant(array_abs_bound(r, j, start + len, (MLDSA_N >> layer) * MLDSA_Q)) - invariant(array_abs_bound(r, start + len, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q))) + invariant(array_abs_bound(r, start + len, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q)) + decreases(start + len - j)) { int32_t t = r[j]; r[j] = t + r[j + len]; @@ -400,7 +413,8 @@ __contract__( invariant(layer <= 8) /* Absolute bounds increase from 1Q before layer 8 */ /* up to 256Q after layer 1 */ - invariant(array_abs_bound(r, 0, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q))) + invariant(array_abs_bound(r, 0, MLDSA_N, (MLDSA_N >> layer) * MLDSA_Q)) + decreases(layer)) { mld_invntt_layer(r, layer); } @@ -416,6 +430,7 @@ __contract__( invariant(j <= MLDSA_N) 
invariant(array_abs_bound(r, 0, j, MLD_INTT_BOUND)) invariant(array_abs_bound(r, j, MLDSA_N, MLDSA_N * MLDSA_Q)) + decreases(MLDSA_N - j) ) { r[j] = mld_fqscale(r[j]); @@ -441,17 +456,17 @@ void mld_poly_invntt_tomont(mld_poly *a) mld_poly_invntt_tomont_c(a); } -MLD_STATIC_TESTABLE void mld_poly_pointwise_montgomery_c(mld_poly *c, - const mld_poly *a, +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +MLD_STATIC_TESTABLE void mld_poly_pointwise_montgomery_c(mld_poly *a, const mld_poly *b) __contract__( requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(b, sizeof(mld_poly))) - requires(memory_no_alias(c, sizeof(mld_poly))) requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) requires(array_abs_bound(b->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) - assigns(memory_slice(c, sizeof(mld_poly))) - ensures(array_abs_bound(c->coeffs, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(a, sizeof(mld_poly))) + ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q)) ) { unsigned int i; @@ -461,32 +476,36 @@ __contract__( for (i = 0; i < MLDSA_N; ++i) __loop__( invariant(i <= MLDSA_N) - invariant(array_abs_bound(c->coeffs, 0, i, MLDSA_Q)) + invariant(array_abs_bound(a->coeffs, 0, i, MLDSA_Q)) + invariant(array_abs_bound(a->coeffs, i, MLDSA_N, MLD_NTT_BOUND)) + decreases(MLDSA_N - i) ) { - c->coeffs[i] = mld_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); + a->coeffs[i] = mld_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); } - mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q); + mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q); } MLD_INTERNAL_API -void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a, - const mld_poly *b) +void mld_poly_pointwise_montgomery(mld_poly *a, const mld_poly *b) { #if defined(MLD_USE_NATIVE_POINTWISE_MONTGOMERY) int ret; mld_assert_abs_bound(a->coeffs, MLDSA_N, MLD_NTT_BOUND); mld_assert_abs_bound(b->coeffs, 
MLDSA_N, MLD_NTT_BOUND); - ret = mld_poly_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs); + ret = mld_poly_pointwise_montgomery_native(a->coeffs, b->coeffs); if (ret == MLD_NATIVE_FUNC_SUCCESS) { - mld_assert_abs_bound(c->coeffs, MLDSA_N, MLDSA_Q); + mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q); return; } #endif /* MLD_USE_NATIVE_POINTWISE_MONTGOMERY */ - mld_poly_pointwise_montgomery_c(c, a, b); + mld_poly_pointwise_montgomery_c(a, b); } +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_INTERNAL_API void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a) { @@ -497,8 +516,10 @@ void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a) __loop__( assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly))) invariant(i <= MLDSA_N) + invariant(forall(k0, i, MLDSA_N, a->coeffs[k0] == loop_entry(*a).coeffs[k0])) invariant(array_bound(a0->coeffs, 0, i, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1)) invariant(array_bound(a1->coeffs, 0, i, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1)) + decreases(MLDSA_N - i) ) { mld_power2round(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]); @@ -508,9 +529,12 @@ void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a) (MLD_2_POW_D / 2) + 1); mld_assert_bound(a1->coeffs, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1); } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#ifndef MLD_POLY_UNIFORM_NBLOCKS #define MLD_POLY_UNIFORM_NBLOCKS \ ((768 + MLD_STREAM128_BLOCKBYTES - 1) / MLD_STREAM128_BLOCKBYTES) +#endif /* Reference: `mld_rej_uniform()` in the reference implementation @[REF]. 
* - Our signature differs from the reference implementation * in that it adds the offset and always expects the base of the @@ -543,7 +567,8 @@ __contract__( while (ctr < target && pos + 3 <= buflen) __loop__( invariant(offset <= ctr && ctr <= target && pos <= buflen) - invariant(array_bound(a, 0, ctr, 0, MLDSA_Q))) + invariant(array_bound(a, 0, ctr, 0, MLDSA_Q)) + decreases(buflen - pos)) { t = buf[pos++]; t |= (uint32_t)buf[pos++] << 8; @@ -560,23 +585,19 @@ __contract__( return ctr; } -/************************************************* - * Name: mld_rej_uniform - * - * Description: Sample uniformly random coefficients in [0, MLDSA_Q-1] by - * performing rejection sampling on array of random bytes. +/** + * Sample uniformly random coefficients in [0, MLDSA_Q-1] by performing + * rejection sampling on an array of random bytes. * - * Arguments: - int32_t *a: pointer to output array (allocated) - * - unsigned int target: requested number of coefficients to - *sample - * - unsigned int offset: number of coefficients already sampled - * - const uint8_t *buf: array of random bytes to sample from - * - unsigned int buflen: length of array of random bytes (must be - * multiple of 3) + * @param[out] a Pointer to output array (allocated). + * @param target Requested number of coefficients to sample. + * @param offset Number of coefficients already sampled. + * @param[in] buf Array of random bytes to sample from. + * @param buflen Length of array of random bytes (must be multiple of 3). * - * Returns number of sampled coefficients. Can be smaller than len if not enough - * random bytes were given. - **************************************************/ + * @return Number of sampled coefficients. Can be smaller than len if not + * enough random bytes were given. + */ /* Reference: `mld_rej_uniform()` in the reference implementation @[REF]. 
* - Our signature differs from the reference implementation @@ -655,7 +676,8 @@ void mld_poly_uniform(mld_poly *a, const uint8_t seed[MLDSA_SEEDBYTES + 2]) mld_zeroize(buf, sizeof(buf)); } -#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && !defined(MLD_CONFIG_REDUCE_RAM) +#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) MLD_INTERNAL_API void mld_poly_uniform_4x(mld_poly *vec0, mld_poly *vec1, mld_poly *vec2, mld_poly *vec3, @@ -720,8 +742,10 @@ void mld_poly_uniform_4x(mld_poly *vec0, mld_poly *vec1, mld_poly *vec2, mld_zeroize(buf, sizeof(buf)); } -#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && !MLD_CONFIG_REDUCE_RAM */ +#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && (!MLD_CONFIG_REDUCE_RAM || \ + MLD_UNIT_TEST) */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_INTERNAL_API void mld_polyt1_pack(uint8_t r[MLDSA_POLYT1_PACKEDBYTES], const mld_poly *a) { @@ -730,7 +754,8 @@ void mld_polyt1_pack(uint8_t r[MLDSA_POLYT1_PACKEDBYTES], const mld_poly *a) for (i = 0; i < MLDSA_N / 4; ++i) __loop__( - invariant(i <= MLDSA_N/4)) + invariant(i <= MLDSA_N/4) + decreases(MLDSA_N / 4 - i)) { r[5 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0] >> 0) & 0xFF); r[5 * i + 1] = @@ -745,7 +770,9 @@ void mld_polyt1_pack(uint8_t r[MLDSA_POLYT1_PACKEDBYTES], const mld_poly *a) r[5 * i + 4] = (uint8_t)((a->coeffs[4 * i + 3] >> 2) & 0xFF); } } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_INTERNAL_API void mld_polyt1_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT1_PACKEDBYTES]) { @@ -754,7 +781,8 @@ void mld_polyt1_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT1_PACKEDBYTES]) for (i = 0; i < MLDSA_N / 4; ++i) __loop__( invariant(i <= MLDSA_N/4) - invariant(array_bound(r->coeffs, 0, i*4, 0, 1 << 10))) + invariant(array_bound(r->coeffs, 0, i*4, 0, 1 << 10)) + decreases(MLDSA_N / 4 - i)) { r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((int32_t)a[5 * i + 1] << 8)) & 0x3FF; @@ -768,7 +796,9 @@ void 
mld_polyt1_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT1_PACKEDBYTES]) mld_assert_bound(r->coeffs, MLDSA_N, 0, 1 << 10); } +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_INTERNAL_API void mld_polyt0_pack(uint8_t r[MLDSA_POLYT0_PACKEDBYTES], const mld_poly *a) { @@ -780,7 +810,8 @@ void mld_polyt0_pack(uint8_t r[MLDSA_POLYT0_PACKEDBYTES], const mld_poly *a) for (i = 0; i < MLDSA_N / 8; ++i) __loop__( - invariant(i <= MLDSA_N/8)) + invariant(i <= MLDSA_N/8) + decreases(MLDSA_N / 8 - i)) { /* Safety: a->coeffs[i] <= (1 << (MLDSA_D - 1) as they are output of * power2round, hence, these casts are safe. */ @@ -815,7 +846,9 @@ void mld_polyt0_pack(uint8_t r[MLDSA_POLYT0_PACKEDBYTES], const mld_poly *a) r[13 * i + 12] = (uint8_t)((t[7] >> 5) & 0xFF); } } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST) MLD_INTERNAL_API void mld_polyt0_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT0_PACKEDBYTES]) { @@ -824,7 +857,8 @@ void mld_polyt0_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT0_PACKEDBYTES]) for (i = 0; i < MLDSA_N / 8; ++i) __loop__( invariant(i <= MLDSA_N/8) - invariant(array_bound(r->coeffs, 0, i*8, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))) + invariant(array_bound(r->coeffs, 0, i*8, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)) + decreases(MLDSA_N / 8 - i)) { r->coeffs[8 * i + 0] = a[13 * i + 0]; r->coeffs[8 * i + 0] |= (int32_t)a[13 * i + 1] << 8; @@ -875,6 +909,7 @@ void mld_polyt0_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT0_PACKEDBYTES]) mld_assert_bound(r->coeffs, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1, (1 << (MLDSA_D - 1)) + 1); } +#endif /* !MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST */ MLD_STATIC_TESTABLE uint32_t mld_poly_chknorm_c(const mld_poly *a, int32_t B) __contract__( @@ -894,6 +929,7 @@ __contract__( invariant(i <= MLDSA_N) invariant(t == 0 || t == 0xFFFFFFFF) invariant((t == 0) == array_abs_bound(a->coeffs, 0, i, B)) + decreases(MLDSA_N - i) ) { /* @@ 
-956,7 +992,7 @@ uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B) if (success) { /* Convert 0 / 1 to 0 / 0xFFFFFFFF here */ - return 0U - (uint32_t)ret; + return mld_ct_cmask_nonzero_u32((uint32_t)ret); } #endif /* MLD_USE_NATIVE_POLY_CHKNORM */ return mld_poly_chknorm_c(a, B); diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly.h b/crypto/fipsmodule/ml_dsa/mldsa/poly.h index ff7710438ee..ddde0f05445 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/poly.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/poly.h @@ -5,7 +5,6 @@ #ifndef MLD_POLY_H #define MLD_POLY_H -#include #include "cbmc.h" #include "common.h" #include "reduce.h" @@ -16,21 +15,22 @@ /* Absolute exclusive upper bound for the output of the inverse NTT*/ #define MLD_INTT_BOUND MLDSA_Q +/** + * Element of R_q = Z_q[X]/(X^n + 1). Represents polynomial + * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]. + */ typedef struct { - int32_t coeffs[MLDSA_N]; + int32_t coeffs[MLDSA_N]; /**< Polynomial coefficients. */ } MLD_ALIGN mld_poly; #define mld_poly_reduce MLD_NAMESPACE(poly_reduce) -/************************************************* - * Name: mld_poly_reduce - * - * Description: Inplace reduction of all coefficients of polynomial to - * representative in - *[-MLD_REDUCE32_RANGE_MAX,MLD_REDUCE32_RANGE_MAX]. +/** + * In-place reduction of all coefficients of polynomial to representative in + * [-MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX]. * - * Arguments: - mld_poly *a: pointer to input/output polynomial - **************************************************/ + * @param[in,out] a Pointer to input/output polynomial. + */ MLD_INTERNAL_API void mld_poly_reduce(mld_poly *a) __contract__( @@ -41,14 +41,12 @@ __contract__( ); #define mld_poly_caddq MLD_NAMESPACE(poly_caddq) -/************************************************* - * Name: mld_poly_caddq +/** + * For all coefficients of in/out polynomial add MLDSA_Q if coefficient is + * negative. 
* - * Description: For all coefficients of in/out polynomial add MLDSA_Q if - * coefficient is negative. - * - * Arguments: - mld_poly *a: pointer to input/output polynomial - **************************************************/ + * @param[in,out] a Pointer to input/output polynomial. + */ MLD_INTERNAL_API void mld_poly_caddq(mld_poly *a) __contract__( @@ -58,16 +56,16 @@ __contract__( ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) ); +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_SIGN_API) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_poly_add MLD_NAMESPACE(poly_add) -/************************************************* - * Name: mld_poly_add +/** + * Add polynomials. No modular reduction is performed. * - * Description: Add polynomials. No modular reduction is performed. - * - * Arguments: - r: Pointer to input-output polynomial to be added to. - * - b: Pointer to input polynomial that should be added - * to r. Must be disjoint from r. - **************************************************/ + * @param[in,out] r Pointer to input-output polynomial to be added to. + * @param[in] b Pointer to input polynomial that should be added to r. + * Must be disjoint from r. + */ /* * NOTE: The reference implementation uses a 3-argument poly_add. @@ -85,18 +83,18 @@ __contract__( ensures(forall(k3, 0, MLDSA_N, r->coeffs[k3] < MLD_REDUCE32_DOMAIN_MAX)) ensures(forall(k4, 0, MLDSA_N, r->coeffs[k4] >= INT32_MIN)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_SIGN_API || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_poly_sub MLD_NAMESPACE(poly_sub) -/************************************************* - * Name: mld_poly_sub - * - * Description: Subtract polynomials. No modular reduction is - * performed. +/** + * Subtract polynomials. No modular reduction is performed. 
* - * Arguments: - mld_poly *r: Pointer to input-output polynomial. - * - const mld_poly *b: Pointer to input polynomial that should be - * subtracted from r. Must be disjoint from r. - **************************************************/ + * @param[in,out] r Pointer to input-output polynomial. + * @param[in] b Pointer to input polynomial that should be subtracted from + * r. Must be disjoint from r. + */ /* * NOTE: The reference implementation uses a 3-argument poly_sub. * We specialize to the accumulator form to avoid reasoning about aliasing. @@ -111,16 +109,16 @@ __contract__( assigns(memory_slice(r, sizeof(mld_poly))) ensures(array_bound(r->coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_poly_shiftl MLD_NAMESPACE(poly_shiftl) -/************************************************* - * Name: mld_poly_shiftl - * - * Description: Multiply polynomial by 2^MLDSA_D without modular reduction. - *Assumes input coefficients to be less than 2^{31-MLDSA_D} in absolute value. +/** + * Multiply polynomial by 2^MLDSA_D without modular reduction. Assumes input + * coefficients to be less than 2^{31-MLDSA_D} in absolute value. * - * Arguments: - mld_poly *a: pointer to input/output polynomial - **************************************************/ + * @param[in,out] a Pointer to input/output polynomial. + */ MLD_INTERNAL_API void mld_poly_shiftl(mld_poly *a) __contract__( @@ -129,16 +127,14 @@ __contract__( assigns(memory_slice(a, sizeof(mld_poly))) ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ #define mld_poly_ntt MLD_NAMESPACE(poly_ntt) -/************************************************* - * Name: mld_poly_ntt - * - * Description: Inplace forward NTT. Coefficients can grow by - * 8*MLDSA_Q in absolute value. +/** + * In-place forward NTT. Coefficients can grow by 8*MLDSA_Q in absolute value. 
* - * Arguments: - mld_poly *a: pointer to input/output polynomial - **************************************************/ + * @param[in,out] a Pointer to input/output polynomial. + */ MLD_INTERNAL_API void mld_poly_ntt(mld_poly *a) __contract__( @@ -150,16 +146,14 @@ __contract__( #define mld_poly_invntt_tomont MLD_NAMESPACE(poly_invntt_tomont) -/************************************************* - * Name: mld_poly_invntt_tomont +/** + * In-place inverse NTT and multiplication by 2^{32}. * - * Description: Inplace inverse NTT and multiplication by 2^{32}. - * Input coefficients need to be less than MLDSA_Q in absolute - * value and output coefficients are bounded by - * MLD_INTT_BOUND. + * Input coefficients need to be less than MLDSA_Q in absolute value and + * output coefficients are bounded by MLD_INTT_BOUND. * - * Arguments: - mld_poly *a: pointer to input/output polynomial - **************************************************/ + * @param[in,out] a Pointer to input/output polynomial. + */ MLD_INTERNAL_API void mld_poly_invntt_tomont(mld_poly *a) __contract__( @@ -169,71 +163,69 @@ __contract__( ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_INTT_BOUND)) ); +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_poly_pointwise_montgomery MLD_NAMESPACE(poly_pointwise_montgomery) -/************************************************* - * Name: mld_poly_pointwise_montgomery - * - * Description: Pointwise multiplication of polynomials in NTT domain - * representation and multiplication of resulting polynomial - * by 2^{-32}. 
- * - * Arguments: - mld_poly *c: pointer to output polynomial - * - const mld_poly *a: pointer to first input polynomial - * - const mld_poly *b: pointer to second input polynomial - **************************************************/ +/** + * Pointwise multiplication of polynomials in NTT domain representation and + * multiplication of resulting polynomial by 2^{-32}. Destructive in the first + * argument. + * + * @param[in,out] a Pointer to first input/output polynomial. On entry, holds + * the first multiplicand; on exit, holds the product + * a * b * 2^{-32}. + * @param[in] b Pointer to second input polynomial. + */ MLD_INTERNAL_API -void mld_poly_pointwise_montgomery(mld_poly *c, const mld_poly *a, - const mld_poly *b) +void mld_poly_pointwise_montgomery(mld_poly *a, const mld_poly *b) __contract__( requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(b, sizeof(mld_poly))) - requires(memory_no_alias(c, sizeof(mld_poly))) requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) requires(array_abs_bound(b->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) - assigns(memory_slice(c, sizeof(mld_poly))) - ensures(array_abs_bound(c->coeffs, 0, MLDSA_N, MLDSA_Q)) + assigns(memory_slice(a, sizeof(mld_poly))) + ensures(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_Q)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #define mld_poly_power2round MLD_NAMESPACE(poly_power2round) -/************************************************* - * Name: mld_poly_power2round - * - * Description: For all coefficients c of the input polynomial, - * compute c0, c1 such that c mod MLDSA_Q = c1*2^MLDSA_D + c0 - * with -2^{MLDSA_D-1} < c0 <= 2^{MLDSA_D-1}. Assumes coefficients - *to be standard representatives. 
- * - * Arguments: - mld_poly *a1: pointer to output polynomial with coefficients - *c1 - * - mld_poly *a0: pointer to output polynomial with coefficients - *c0 - * - const mld_poly *a: pointer to input polynomial - **************************************************/ +/** + * For all coefficients c of the input polynomial, compute c0, c1 such that + * c mod MLDSA_Q = c1*2^MLDSA_D + c0 with -2^{MLDSA_D-1} < c0 <= 2^{MLDSA_D-1}. + * Assumes coefficients to be standard representatives. + * + * @param[out] a1 Pointer to output polynomial with coefficients c1. + * @param[out] a0 Pointer to output polynomial with coefficients c0; may alias + * the input polynomial a. + * @param[in] a Pointer to input polynomial. + */ MLD_INTERNAL_API void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a) __contract__( requires(memory_no_alias(a0, sizeof(mld_poly))) requires(memory_no_alias(a1, sizeof(mld_poly))) - requires(memory_no_alias(a, sizeof(mld_poly))) + /* The implementation does not require a0 == a, but the single call site + * aliases them and asserting equality simplifies the proof. */ + requires(a0 == a) requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) assigns(memory_slice(a1, sizeof(mld_poly))) assigns(memory_slice(a0, sizeof(mld_poly))) ensures(array_bound(a0->coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1)) ensures(array_bound(a1->coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ #define mld_poly_uniform MLD_NAMESPACE(poly_uniform) -/************************************************* - * Name: mld_poly_uniform - * - * Description: Sample polynomial with uniformly random coefficients - * in [0,MLDSA_Q-1] by performing rejection sampling on the - * output stream of SHAKE128(seed|nonce) +/** + * Sample polynomial with uniformly random coefficients in [0, MLDSA_Q-1] by + * performing rejection sampling on the output stream of SHAKE128(seed|nonce). 
* - * Arguments: - mld_poly *a: pointer to output polynomial - * - const uint8_t seed[]: byte array with seed of length - * MLDSA_SEEDBYTES and the packed 2-byte nonce - **************************************************/ + * @param[out] a Pointer to output polynomial. + * @param[in] seed Byte array with seed of length MLDSA_SEEDBYTES and the + * packed 2-byte nonce. + */ MLD_INTERNAL_API void mld_poly_uniform(mld_poly *a, const uint8_t seed[MLDSA_SEEDBYTES + 2]) __contract__( @@ -243,21 +235,20 @@ __contract__( ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) ); -#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && !defined(MLD_CONFIG_REDUCE_RAM) +#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) #define mld_poly_uniform_4x MLD_NAMESPACE(poly_uniform_4x) -/************************************************* - * Name: mld_poly_uniform_x4 - * - * Description: Generate four polynomials using rejection sampling - * on (pseudo-)uniformly random bytes sampled from a seed. - * - * Arguments: - mld_poly *vec0, *vec1, *vec2, *vec3: - * Pointers to 4 polynomials to be sampled. - * - uint8_t seed[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]: - * Pointer consecutive array of seed buffers of size - * MLDSA_SEEDBYTES + 2 each, plus padding for alignment. - * - **************************************************/ +/** + * Generate four polynomials using rejection sampling on (pseudo-)uniformly + * random bytes sampled from a seed. + * + * @param[out] vec0 Pointer to first polynomial to be sampled. + * @param[out] vec1 Pointer to second polynomial to be sampled. + * @param[out] vec2 Pointer to third polynomial to be sampled. + * @param[out] vec3 Pointer to fourth polynomial to be sampled. + * @param[in] seed Pointer to consecutive array of seed buffers of size + * MLDSA_SEEDBYTES + 2 each, plus padding for alignment. 
+ */ MLD_INTERNAL_API void mld_poly_uniform_4x(mld_poly *vec0, mld_poly *vec1, mld_poly *vec2, mld_poly *vec3, @@ -277,19 +268,19 @@ __contract__( ensures(array_bound(vec2->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) ensures(array_bound(vec3->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) ); -#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && !MLD_CONFIG_REDUCE_RAM */ +#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && (!MLD_CONFIG_REDUCE_RAM || \ + MLD_UNIT_TEST) */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #define mld_polyt1_pack MLD_NAMESPACE(polyt1_pack) -/************************************************* - * Name: mld_polyt1_pack +/** + * Bit-pack polynomial t1 with coefficients fitting in 10 bits. Input + * coefficients are assumed to be standard representatives. * - * Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. - * Input coefficients are assumed to be standard representatives. - * - * Arguments: - uint8_t *r: pointer to output byte array with at least - * MLDSA_POLYT1_PACKEDBYTES bytes - * - const mld_poly *a: pointer to input polynomial - **************************************************/ + * @param[out] r Pointer to output byte array with at least + * MLDSA_POLYT1_PACKEDBYTES bytes. + * @param[in] a Pointer to input polynomial. + */ MLD_INTERNAL_API void mld_polyt1_pack(uint8_t r[MLDSA_POLYT1_PACKEDBYTES], const mld_poly *a) __contract__( @@ -298,17 +289,17 @@ __contract__( requires(array_bound(a->coeffs, 0, MLDSA_N, 0, 1 << 10)) assigns(memory_slice(r, MLDSA_POLYT1_PACKEDBYTES)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_polyt1_unpack MLD_NAMESPACE(polyt1_unpack) -/************************************************* - * Name: mld_polyt1_unpack +/** + * Unpack polynomial t1 with 10-bit coefficients. Output coefficients are + * standard representatives. * - * Description: Unpack polynomial t1 with 10-bit coefficients. - * Output coefficients are standard representatives. 
- * - * Arguments: - mld_poly *r: pointer to output polynomial - * - const uint8_t *a: byte array with bit-packed polynomial - **************************************************/ + * @param[out] r Pointer to output polynomial. + * @param[in] a Byte array with bit-packed polynomial. + */ MLD_INTERNAL_API void mld_polyt1_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT1_PACKEDBYTES]) __contract__( @@ -317,18 +308,17 @@ __contract__( assigns(memory_slice(r, sizeof(mld_poly))) ensures(array_bound(r->coeffs, 0, MLDSA_N, 0, 1 << 10)) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #define mld_polyt0_pack MLD_NAMESPACE(polyt0_pack) -/************************************************* - * Name: mld_polyt0_pack - * - * Description: Bit-pack polynomial t0 with coefficients in ]-2^{MLDSA_D-1}, - * 2^{MLDSA_D-1}]. +/** + * Bit-pack polynomial t0 with coefficients in ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}]. * - * Arguments: - uint8_t *r: pointer to output byte array with at least - * MLDSA_POLYT0_PACKEDBYTES bytes - * - const mld_poly *a: pointer to input polynomial - **************************************************/ + * @param[out] r Pointer to output byte array with at least + * MLDSA_POLYT0_PACKEDBYTES bytes. + * @param[in] a Pointer to input polynomial. + */ MLD_INTERNAL_API void mld_polyt0_pack(uint8_t r[MLDSA_POLYT0_PACKEDBYTES], const mld_poly *a) __contract__( @@ -337,18 +327,16 @@ __contract__( requires(array_bound(a->coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)) assigns(memory_slice(r, MLDSA_POLYT0_PACKEDBYTES)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ - +#if !defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST) #define mld_polyt0_unpack MLD_NAMESPACE(polyt0_unpack) -/************************************************* - * Name: mld_polyt0_unpack +/** + * Unpack polynomial t0 with coefficients in ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}]. 
* - * Description: Unpack polynomial t0 with coefficients in ]-2^{MLDSA_D-1}, - *2^{MLDSA_D-1}]. - * - * Arguments: - mld_poly *r: pointer to output polynomial - * - const uint8_t *a: byte array with bit-packed polynomial - **************************************************/ + * @param[out] r Pointer to output polynomial. + * @param[in] a Byte array with bit-packed polynomial. + */ MLD_INTERNAL_API void mld_polyt0_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYT0_PACKEDBYTES]) __contract__( @@ -357,30 +345,27 @@ __contract__( assigns(memory_slice(r, sizeof(mld_poly))) ensures(array_bound(r->coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST */ #define mld_poly_chknorm MLD_NAMESPACE(poly_chknorm) -/************************************************* - * Name: mld_poly_chknorm - * - * Description: Check infinity norm of polynomial against given bound. - * Assumes input coefficients were reduced by mld_reduce32(). +/** + * Check infinity norm of polynomial against given bound. Assumes input + * coefficients were reduced by mld_reduce32(). * - * Arguments: - const mld_poly *a: pointer to polynomial - * - int32_t B: norm bound + * @spec{The definition in FIPS-204 requires signed canonical reduction prior + * to applying the bounds check. However, `-B < (a mod± MLDSA_Q) < B` is + * equivalent to `-B < a < B` under the assumption that + * `B <= MLDSA_Q - MLD_REDUCE32_RANGE_MAX` (cf. the assertion in the code). + * Hence, the present spec and implementation are correct without reduction.} * - * Returns 0 if norm is strictly smaller than - * B <= (MLDSA_Q - MLD_REDUCE32_RANGE_MAX) and 0xFFFFFFFF otherwise. + * @param[in] a Pointer to polynomial. + * @param B Norm bound. * - * Specification: The definition of this FIPS-204 requires signed canonical - * reduction prior to applying the bounds check. 
- * However, `-B < (a mod± MLDSA_Q) < B` is equivalent to - * `-B < a < B` under the assumption that - * `B <= MLDSA_Q - MLD_REDUCE32_RANGE_MAX` (cf. the assertion in - * the code). Hence, the present spec and implementation are - * correct without reduction. - * - **************************************************/ + * @return 0 if norm is strictly smaller than + * B <= (MLDSA_Q - MLD_REDUCE32_RANGE_MAX) and 0xFFFFFFFF otherwise. + */ MLD_INTERNAL_API +MLD_MUST_CHECK_RETURN_VALUE uint32_t mld_poly_chknorm(const mld_poly *a, int32_t B) __contract__( requires(memory_no_alias(a, sizeof(mld_poly))) diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c index 7a319ca515c..4ce6ac7a6b8 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c +++ b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.c @@ -18,12 +18,10 @@ * https://github.com/pq-crystals/dilithium/tree/master/ref */ -#include -#include +#include "poly_kl.h" #include "ct.h" #include "debug.h" -#include "poly_kl.h" #include "rounding.h" #include "symmetric.h" @@ -39,6 +37,7 @@ /* End of parameter set namespacing */ +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_STATIC_TESTABLE void mld_poly_decompose_c(mld_poly *a1, mld_poly *a0) __contract__( @@ -60,6 +59,7 @@ __contract__( invariant(array_bound(a0->coeffs, i, MLDSA_N, 0, MLDSA_Q)) invariant(array_bound(a1->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) invariant(array_abs_bound(a0->coeffs, 0, i, MLDSA_GAMMA2+1)) + decreases(MLDSA_N - i) ) { mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a0->coeffs[i]); @@ -101,39 +101,17 @@ void mld_poly_decompose(mld_poly *a1, mld_poly *a0) mld_poly_decompose_c(a1, a0); } -MLD_INTERNAL_API -unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0, - const mld_poly *a1) -{ - unsigned int i, s = 0; +#endif /* !MLD_CONFIG_NO_SIGN_API */ - for (i = 0; i < MLDSA_N; ++i) - __loop__( - invariant(i <= MLDSA_N) - invariant(s <= i) - invariant(array_bound(h->coeffs, 0, i, 0, 2)) - ) - { - const unsigned 
int hint_bit = mld_make_hint(a0->coeffs[i], a1->coeffs[i]); - h->coeffs[i] = (int32_t)hint_bit; - s += hint_bit; - } - - mld_assert(s <= MLDSA_N); - mld_assert_bound(h->coeffs, MLDSA_N, 0, 2); - return s; -} - -MLD_STATIC_TESTABLE void mld_poly_use_hint_c(mld_poly *b, const mld_poly *a, - const mld_poly *h) +#if !defined(MLD_CONFIG_NO_VERIFY_API) +MLD_STATIC_TESTABLE void mld_poly_use_hint_c(mld_poly *a, const mld_poly *h) __contract__( - requires(memory_no_alias(a, sizeof(mld_poly))) - requires(memory_no_alias(b, sizeof(mld_poly))) + requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(h, sizeof(mld_poly))) requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) requires(array_bound(h->coeffs, 0, MLDSA_N, 0, 2)) - assigns(memory_slice(b, sizeof(mld_poly))) - ensures(array_bound(b->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + assigns(memory_slice(a, sizeof(mld_poly))) + ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) ) { unsigned int i; @@ -143,25 +121,27 @@ __contract__( for (i = 0; i < MLDSA_N; ++i) __loop__( invariant(i <= MLDSA_N) - invariant(array_bound(b->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + invariant(array_bound(a->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + invariant(array_bound(a->coeffs, i, MLDSA_N, 0, MLDSA_Q)) + decreases(MLDSA_N - i) ) { - b->coeffs[i] = mld_use_hint(a->coeffs[i], h->coeffs[i]); + a->coeffs[i] = mld_use_hint(a->coeffs[i], h->coeffs[i]); } - mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); + mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); } MLD_INTERNAL_API -void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h) +void mld_poly_use_hint(mld_poly *a, const mld_poly *h) { #if defined(MLD_USE_NATIVE_POLY_USE_HINT_88) && MLD_CONFIG_PARAMETER_SET == 44 int ret; mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); mld_assert_bound(h->coeffs, MLDSA_N, 0, 2); - ret = 
mld_poly_use_hint_88_native(b->coeffs, a->coeffs, h->coeffs); + ret = mld_poly_use_hint_88_native(a->coeffs, h->coeffs); if (ret == MLD_NATIVE_FUNC_SUCCESS) { - mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); + mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); return; } #elif defined(MLD_USE_NATIVE_POLY_USE_HINT_32) && \ @@ -169,34 +149,33 @@ void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h) int ret; mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); mld_assert_bound(h->coeffs, MLDSA_N, 0, 2); - ret = mld_poly_use_hint_32_native(b->coeffs, a->coeffs, h->coeffs); + ret = mld_poly_use_hint_32_native(a->coeffs, h->coeffs); if (ret == MLD_NATIVE_FUNC_SUCCESS) { - mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); + mld_assert_bound(a->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); return; } #endif /* !(MLD_USE_NATIVE_POLY_USE_HINT_88 && MLD_CONFIG_PARAMETER_SET == 44) \ && MLD_USE_NATIVE_POLY_USE_HINT_32 && (MLD_CONFIG_PARAMETER_SET == \ 65 || MLD_CONFIG_PARAMETER_SET == 87) */ - mld_poly_use_hint_c(b, a, h); + mld_poly_use_hint_c(a, h); } +#endif /* !MLD_CONFIG_NO_VERIFY_API */ -/************************************************* - * Name: mld_rej_eta - * - * Description: Sample uniformly random coefficients in [-MLDSA_ETA, MLDSA_ETA] - *by performing rejection sampling on array of random bytes. +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +/** + * Sample uniformly random coefficients in [-MLDSA_ETA, MLDSA_ETA] by + * performing rejection sampling on an array of random bytes. * - * Arguments: - int32_t *a: pointer to output array (allocated) - * - unsigned int target: requested number of coefficients to - *sample - * - unsigned int offset: number of coefficients already sampled - * - const uint8_t *buf: array of random bytes to sample from - * - unsigned int buflen: length of array of random bytes + * @param[out] a Pointer to output array (allocated). 
+ * @param target Requested number of coefficients to sample. + * @param offset Number of coefficients already sampled. + * @param[in] buf Array of random bytes to sample from. + * @param buflen Length of array of random bytes. * - * Returns number of sampled coefficients. Can be smaller than target if not - *enough random bytes were given. - **************************************************/ + * @return Number of sampled coefficients. Can be smaller than target if not + * enough random bytes were given. + */ /* Reference: `mld_rej_eta()` in the reference implementation @[REF]. * - Our signature differs from the reference implementation @@ -247,6 +226,7 @@ __contract__( __loop__( invariant(offset <= ctr && ctr <= target && pos <= buflen) invariant(array_abs_bound(a, 0, ctr, MLDSA_ETA + 1)) + decreases(buflen - pos) ) { t0 = buf[pos] & 0x0F; @@ -483,12 +463,16 @@ void mld_poly_uniform_eta(mld_poly *r, const uint8_t seed[MLDSA_CRHBYTES], mld_zeroize(extseed, sizeof(extseed)); } #endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) #define MLD_POLY_UNIFORM_GAMMA1_NBLOCKS \ ((MLDSA_POLYZ_PACKEDBYTES + MLD_STREAM256_BLOCKBYTES - 1) / \ MLD_STREAM256_BLOCKBYTES) -#if MLD_CONFIG_PARAMETER_SET == 65 || defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) +#if MLD_CONFIG_PARAMETER_SET == 65 || \ + defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_INTERNAL_API void mld_poly_uniform_gamma1(mld_poly *a, const uint8_t seed[MLDSA_CRHBYTES], uint16_t nonce) @@ -516,10 +500,12 @@ void mld_poly_uniform_gamma1(mld_poly *a, const uint8_t seed[MLDSA_CRHBYTES], mld_zeroize(buf, sizeof(buf)); mld_zeroize(extseed, sizeof(extseed)); } -#endif /* MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_SERIAL_FIPS202_ONLY */ +#endif /* MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_SERIAL_FIPS202_ONLY || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ -#if 
!defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) +#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) MLD_INTERNAL_API void mld_poly_uniform_gamma1_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2, mld_poly *r3, @@ -568,8 +554,11 @@ void mld_poly_uniform_gamma1_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2, mld_zeroize(buf, sizeof(buf)); mld_zeroize(extseed, sizeof(extseed)); } -#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */ +#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && (!MLD_CONFIG_REDUCE_RAM || \ + MLD_UNIT_TEST) */ +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) MLD_INTERNAL_API void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES]) { @@ -591,6 +580,7 @@ void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES]) __loop__( assigns(i, signs) invariant(i <= 8) + decreases(8 - i) ) { signs |= (uint64_t)buf[i] << 8 * i; @@ -608,8 +598,10 @@ void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES]) invariant(pos <= SHAKE256_RATE) invariant(array_bound(c->coeffs, 0, MLDSA_N, -1, 2)) invariant(state.pos <= SHAKE256_RATE) + decreases(MLDSA_N - i) ) { + /* This loop terminates only probabilistically, hence no decreases clause. 
*/ do __loop__( assigns(j, object_whole(buf), state, pos) @@ -649,7 +641,9 @@ void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES]) mld_zeroize(buf, sizeof(buf)); mld_zeroize(&signs, sizeof(signs)); } +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_INTERNAL_API void mld_polyeta_pack(uint8_t r[MLDSA_POLYETA_PACKEDBYTES], const mld_poly *a) { @@ -661,7 +655,8 @@ void mld_polyeta_pack(uint8_t r[MLDSA_POLYETA_PACKEDBYTES], const mld_poly *a) #if MLDSA_ETA == 2 for (i = 0; i < MLDSA_N / 8; ++i) __loop__( - invariant(i <= MLDSA_N/8)) + invariant(i <= MLDSA_N/8) + decreases(MLDSA_N / 8 - i)) { /* The casts are safe since we assume that the coefficients * of a are <= MLDSA_ETA in absolute value. */ @@ -683,7 +678,8 @@ void mld_polyeta_pack(uint8_t r[MLDSA_POLYETA_PACKEDBYTES], const mld_poly *a) #elif MLDSA_ETA == 4 for (i = 0; i < MLDSA_N / 2; ++i) __loop__( - invariant(i <= MLDSA_N/2)) + invariant(i <= MLDSA_N/2) + decreases(MLDSA_N / 2 - i)) { /* The casts are safe since we assume that the coefficients * of a are <= MLDSA_ETA in absolute value. 
*/ @@ -695,7 +691,9 @@ void mld_polyeta_pack(uint8_t r[MLDSA_POLYETA_PACKEDBYTES], const mld_poly *a) #error "Invalid value of MLDSA_ETA" #endif /* MLDSA_ETA != 2 && MLDSA_ETA != 4 */ } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_SIGN_API) void mld_polyeta_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYETA_PACKEDBYTES]) { unsigned int i; @@ -704,7 +702,8 @@ void mld_polyeta_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYETA_PACKEDBYTES]) for (i = 0; i < MLDSA_N / 8; ++i) __loop__( invariant(i <= MLDSA_N/8) - invariant(array_bound(r->coeffs, 0, i*8, -5, MLDSA_ETA + 1))) + invariant(array_bound(r->coeffs, 0, i*8, -5, MLDSA_ETA + 1)) + decreases(MLDSA_N / 8 - i)) { r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; @@ -728,7 +727,8 @@ void mld_polyeta_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYETA_PACKEDBYTES]) for (i = 0; i < MLDSA_N / 2; ++i) __loop__( invariant(i <= MLDSA_N/2) - invariant(array_bound(r->coeffs, 0, i*2, -11, MLDSA_ETA + 1))) + invariant(array_bound(r->coeffs, 0, i*2, -11, MLDSA_ETA + 1)) + decreases(MLDSA_N / 2 - i)) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; @@ -742,8 +742,9 @@ void mld_polyeta_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYETA_PACKEDBYTES]) mld_assert_bound(r->coeffs, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1); } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_SIGN_API */ - +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_INTERNAL_API void mld_polyz_pack(uint8_t r[MLDSA_POLYZ_PACKEDBYTES], const mld_poly *a) { @@ -755,7 +756,8 @@ void mld_polyz_pack(uint8_t r[MLDSA_POLYZ_PACKEDBYTES], const mld_poly *a) #if MLD_CONFIG_PARAMETER_SET == 44 for (i = 0; i < MLDSA_N / 4; ++i) __loop__( - invariant(i <= MLDSA_N/4)) + invariant(i <= MLDSA_N/4) + decreases(MLDSA_N / 4 - i)) { /* Safety: a->coeffs[i] <= MLDSA_GAMMA1, hence, these casts are safe. 
*/ t[0] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[4 * i + 0]); @@ -779,7 +781,8 @@ void mld_polyz_pack(uint8_t r[MLDSA_POLYZ_PACKEDBYTES], const mld_poly *a) #else /* MLD_CONFIG_PARAMETER_SET == 44 */ for (i = 0; i < MLDSA_N / 2; ++i) __loop__( - invariant(i <= MLDSA_N/2)) + invariant(i <= MLDSA_N/2) + decreases(MLDSA_N / 2 - i)) { /* Safety: a->coeffs[i] <= MLDSA_GAMMA1, hence, these casts are safe. */ t[0] = (uint32_t)(MLDSA_GAMMA1 - a->coeffs[2 * i + 0]); @@ -794,7 +797,9 @@ void mld_polyz_pack(uint8_t r[MLDSA_POLYZ_PACKEDBYTES], const mld_poly *a) } #endif /* MLD_CONFIG_PARAMETER_SET != 44 */ } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) MLD_STATIC_TESTABLE void mld_polyz_unpack_c( mld_poly *r, const uint8_t a[MLDSA_POLYZ_PACKEDBYTES]) __contract__( @@ -809,7 +814,8 @@ __contract__( for (i = 0; i < MLDSA_N / 4; ++i) __loop__( invariant(i <= MLDSA_N/4) - invariant(array_bound(r->coeffs, 0, i*4, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) + invariant(array_bound(r->coeffs, 0, i*4, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) + decreases(MLDSA_N / 4 - i)) { r->coeffs[4 * i + 0] = a[9 * i + 0]; r->coeffs[4 * i + 0] |= (int32_t)a[9 * i + 1] << 8; @@ -840,7 +846,8 @@ __contract__( for (i = 0; i < MLDSA_N / 2; ++i) __loop__( invariant(i <= MLDSA_N/2) - invariant(array_bound(r->coeffs, 0, i*2, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) + invariant(array_bound(r->coeffs, 0, i*2, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) + decreases(MLDSA_N / 2 - i)) { r->coeffs[2 * i + 0] = a[5 * i + 0]; r->coeffs[2 * i + 0] |= (int32_t)a[5 * i + 1] << 8; @@ -897,7 +904,8 @@ void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a) #if MLD_CONFIG_PARAMETER_SET == 44 for (i = 0; i < MLDSA_N / 4; ++i) __loop__( - invariant(i <= MLDSA_N/4)) + invariant(i <= MLDSA_N/4) + decreases(MLDSA_N / 4 - i)) { r[3 * i + 0] = (uint8_t)((a->coeffs[4 * i + 0]) & 0xFF); r[3 * i + 0] |= (uint8_t)((a->coeffs[4 * i + 1] << 6) & 0xFF); 
@@ -909,13 +917,15 @@ void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a) #else /* MLD_CONFIG_PARAMETER_SET == 44 */ for (i = 0; i < MLDSA_N / 2; ++i) __loop__( - invariant(i <= MLDSA_N/2)) + invariant(i <= MLDSA_N/2) + decreases(MLDSA_N / 2 - i)) { r[i] = (uint8_t)((a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)) & 0xFF); } #endif /* MLD_CONFIG_PARAMETER_SET != 44 */ } +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h index c6aa498358f..2e5163ce156 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/poly_kl.h @@ -9,27 +9,23 @@ #include "common.h" #include "poly.h" +#if !defined(MLD_CONFIG_NO_SIGN_API) #define mld_poly_decompose MLD_NAMESPACE_KL(poly_decompose) -/************************************************* - * Name: mld_poly_decompose +/** + * For all coefficients c of the input polynomial, compute high and low bits + * c0, c1 such that c mod MLDSA_Q = c1*ALPHA + c0 with -ALPHA/2 < c0 <= ALPHA/2 + * except c1 = (MLDSA_Q-1)/ALPHA where we set c1 = 0 and + * -ALPHA/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0. Assumes coefficients to be + * standard representatives. * - * Description: For all coefficients c of the input polynomial, - * compute high and low bits c0, c1 such c mod MLDSA_Q = c1*ALPHA + - * c0 with -ALPHA/2 < c0 <= ALPHA/2 except - * c1 = (MLDSA_Q-1)/ALPHA where we set - * c1 = 0 and -ALPHA/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0. - * Assumes coefficients to be standard representatives. + * @reference{The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. 
Removing + * the aliasing eases CBMC proofs.} * - * Arguments: - mld_poly *a1: pointer to output polynomial with coefficients - * c1 - * - mld_poly *a0: pointer to input/output polynomial. Output - * polynomial has coefficients c0 - * - * Reference: The reference implementation has the input polynomial as a - * separate argument that may be aliased with either of the outputs. - * Removing the aliasing eases CBMC proofs. - * - **************************************************/ + * @param[out] a1 Pointer to output polynomial with coefficients c1. + * @param[in,out] a0 Pointer to input/output polynomial. Output polynomial has + * coefficients c0. + */ MLD_INTERNAL_API void mld_poly_decompose(mld_poly *a1, mld_poly *a0) __contract__( @@ -42,76 +38,46 @@ __contract__( ensures(array_abs_bound(a0->coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -#define mld_poly_make_hint MLD_NAMESPACE_KL(poly_make_hint) -/************************************************* - * Name: mld_poly_make_hint - * - * Description: Compute hint polynomial. The coefficients of which indicate - * whether the low bits of the corresponding coefficient of - * the input polynomial overflow into the high bits. - * - * Arguments: - mld_poly *h: pointer to output hint polynomial - * - const mld_poly *a0: pointer to low part of input polynomial - * - const mld_poly *a1: pointer to high part of input polynomial - * - * Returns number of 1 bits. 
- **************************************************/ -MLD_INTERNAL_API -unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0, - const mld_poly *a1) -__contract__( - requires(memory_no_alias(h, sizeof(mld_poly))) - requires(memory_no_alias(a0, sizeof(mld_poly))) - requires(memory_no_alias(a1, sizeof(mld_poly))) - assigns(memory_slice(h, sizeof(mld_poly))) - ensures(return_value <= MLDSA_N) - ensures(array_bound(h->coeffs, 0, MLDSA_N, 0, 2)) -); - +#if !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_poly_use_hint MLD_NAMESPACE_KL(poly_use_hint) -/************************************************* - * Name: mld_poly_use_hint - * - * Description: Use hint polynomial to correct the high bits of a polynomial. +/** + * Use hint polynomial h to correct the high bits of a in-place. * - * Arguments: - mld_poly *b: pointer to output polynomial with corrected high - *bits - * - const mld_poly *a: pointer to input polynomial - * - const mld_poly *h: pointer to input hint polynomial - **************************************************/ + * @param[in,out] a Input/output polynomial. + * @param[in] h Hint polynomial. 
+ */ MLD_INTERNAL_API -void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h) +void mld_poly_use_hint(mld_poly *a, const mld_poly *h) __contract__( - requires(memory_no_alias(a, sizeof(mld_poly))) - requires(memory_no_alias(b, sizeof(mld_poly))) + requires(memory_no_alias(a, sizeof(mld_poly))) requires(memory_no_alias(h, sizeof(mld_poly))) requires(array_bound(a->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) requires(array_bound(h->coeffs, 0, MLDSA_N, 0, 2)) - assigns(memory_slice(b, sizeof(mld_poly))) - ensures(array_bound(b->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + assigns(memory_slice(a, sizeof(mld_poly))) + ensures(array_bound(a->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) #define mld_poly_uniform_eta_4x MLD_NAMESPACE_KL(poly_uniform_eta_4x) -/************************************************* - * Name: mld_poly_uniform_eta +/** + * Sample four polynomials with uniformly random coefficients in + * [-MLDSA_ETA, MLDSA_ETA] by performing rejection sampling on the output + * stream from SHAKE256(seed|nonce_i). * - * Description: Sample four polynomials with uniformly random coefficients - * in [-MLDSA_ETA,MLDSA_ETA] by performing rejection sampling on - * the output stream from SHAKE256(seed|nonce_i) - * - * Arguments: - mld_poly *r0: pointer to first output polynomial - * - mld_poly *r1: pointer to second output polynomial - * - mld_poly *r2: pointer to third output polynomial - * - mld_poly *r3: pointer to fourth output polynomial - * - const uint8_t seed[]: byte array with seed of length - * MLDSA_CRHBYTES - * - uint8_t nonce0: first nonce - * - uint8_t nonce1: second nonce - * - uint8_t nonce2: third nonce - * - uint8_t nonce3: fourth nonce - **************************************************/ + * @param[out] r0 Pointer to first output polynomial. + * @param[out] r1 Pointer to second output polynomial. 
+ * @param[out] r2 Pointer to third output polynomial. + * @param[out] r3 Pointer to fourth output polynomial. + * @param[in] seed Byte array with seed of length MLDSA_CRHBYTES. + * @param nonce0 First nonce. + * @param nonce1 Second nonce. + * @param nonce2 Third nonce. + * @param nonce3 Fourth nonce. + */ MLD_INTERNAL_API void mld_poly_uniform_eta_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2, mld_poly *r3, const uint8_t seed[MLDSA_CRHBYTES], @@ -136,18 +102,15 @@ __contract__( #if defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) #define mld_poly_uniform_eta MLD_NAMESPACE_KL(poly_uniform_eta) -/************************************************* - * Name: mld_poly_uniform_eta +/** + * Sample polynomial with uniformly random coefficients in + * [-MLDSA_ETA, MLDSA_ETA] by performing rejection sampling on the output + * stream from SHAKE256(seed|nonce). * - * Description: Sample polynomial with uniformly random coefficients - * in [-MLDSA_ETA,MLDSA_ETA] by performing rejection sampling on - * the output stream from SHAKE256(seed|nonce) - * - * Arguments: - mld_poly *r: pointer to output polynomial - * - const uint8_t seed[]: byte array with seed of length - * MLDSA_CRHBYTES - * - uint8_t nonce: nonce - **************************************************/ + * @param[out] r Pointer to output polynomial. + * @param[in] seed Byte array with seed of length MLDSA_CRHBYTES. + * @param nonce Nonce. 
+ */ MLD_INTERNAL_API void mld_poly_uniform_eta(mld_poly *r, const uint8_t seed[MLDSA_CRHBYTES], uint8_t nonce) @@ -158,21 +121,22 @@ __contract__( ensures(array_abs_bound(r->coeffs, 0, MLDSA_N, MLDSA_ETA + 1)) ); #endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ -#if MLD_CONFIG_PARAMETER_SET == 65 || defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) +#if !defined(MLD_CONFIG_NO_SIGN_API) +#if MLD_CONFIG_PARAMETER_SET == 65 || \ + defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) || \ + defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_poly_uniform_gamma1 MLD_NAMESPACE_KL(poly_uniform_gamma1) -/************************************************* - * Name: mld_poly_uniform_gamma1 +/** + * Sample polynomial with uniformly random coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output stream of + * SHAKE256(seed|nonce). * - * Description: Sample polynomial with uniformly random coefficients - * in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output - * stream of SHAKE256(seed|nonce) - * - * Arguments: - mld_poly *a: pointer to output polynomial - * - const uint8_t seed[]: byte array with seed of length - * MLDSA_CRHBYTES - * - uint16_t nonce: 16-bit nonce - **************************************************/ + * @param[out] a Pointer to output polynomial. + * @param[in] seed Byte array with seed of length MLDSA_CRHBYTES. + * @param nonce 16-bit nonce. 
+ */ MLD_INTERNAL_API void mld_poly_uniform_gamma1(mld_poly *a, const uint8_t seed[MLDSA_CRHBYTES], uint16_t nonce) @@ -182,22 +146,27 @@ __contract__( assigns(memory_slice(a, sizeof(mld_poly))) ensures(array_bound(a->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) ); -#endif /* MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_SERIAL_FIPS202_ONLY */ +#endif /* MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_SERIAL_FIPS202_ONLY || \ + MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ -#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) +#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) #define mld_poly_uniform_gamma1_4x MLD_NAMESPACE_KL(poly_uniform_gamma1_4x) -/************************************************* - * Name: mld_poly_uniform_gamma1_4x +/** + * Sample four polynomials with uniformly random coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output streams of + * SHAKE256(seed|nonce_i). * - * Description: Sample polynomial with uniformly random coefficients - * in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output - * stream of SHAKE256(seed|nonce) - * - * Arguments: - mld_poly *a: pointer to output polynomial - * - const uint8_t seed[]: byte array with seed of length - * MLDSA_CRHBYTES - * - uint16_t nonce: 16-bit nonce - **************************************************/ + * @param[out] r0 Pointer to first output polynomial. + * @param[out] r1 Pointer to second output polynomial. + * @param[out] r2 Pointer to third output polynomial. + * @param[out] r3 Pointer to fourth output polynomial. + * @param[in] seed Byte array with seed of length MLDSA_CRHBYTES. + * @param nonce0 First 16-bit nonce. + * @param nonce1 Second 16-bit nonce. + * @param nonce2 Third 16-bit nonce. + * @param nonce3 Fourth 16-bit nonce. 
+ */ MLD_INTERNAL_API void mld_poly_uniform_gamma1_4x(mld_poly *r0, mld_poly *r1, mld_poly *r2, mld_poly *r3, @@ -219,20 +188,19 @@ __contract__( ensures(array_bound(r2->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) ensures(array_bound(r3->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) ); -#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */ +#endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY && (!MLD_CONFIG_REDUCE_RAM || \ + MLD_UNIT_TEST) */ +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_poly_challenge MLD_NAMESPACE_KL(poly_challenge) -/************************************************* - * Name: mld_poly_challenge - * - * Description: Implementation of H. Samples polynomial with MLDSA_TAU nonzero - * coefficients in {-1,1} using the output stream of - * SHAKE256(seed). +/** + * Implementation of H. Samples polynomial with MLDSA_TAU nonzero coefficients + * in {-1, 1} using the output stream of SHAKE256(seed). * - * Arguments: - mld_poly *c: pointer to output polynomial - * - const uint8_t mu[]: byte array containing seed of length - * MLDSA_CTILDEBYTES - **************************************************/ + * @param[out] c Pointer to output polynomial. + * @param[in] seed Byte array containing seed of length MLDSA_CTILDEBYTES. + */ MLD_INTERNAL_API void mld_poly_challenge(mld_poly *c, const uint8_t seed[MLDSA_CTILDEBYTES]) __contract__( @@ -242,17 +210,17 @@ __contract__( /* All coefficients of c are -1, 0 or +1 */ ensures(array_bound(c->coeffs, 0, MLDSA_N, -1, 2)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #define mld_polyeta_pack MLD_NAMESPACE_KL(polyeta_pack) -/************************************************* - * Name: mld_polyeta_pack +/** + * Bit-pack polynomial with coefficients in [-MLDSA_ETA, MLDSA_ETA]. * - * Description: Bit-pack polynomial with coefficients in [-MLDSA_ETA,MLDSA_ETA]. 
- * - * Arguments: - uint8_t *r: pointer to output byte array with at least - * MLDSA_POLYETA_PACKEDBYTES bytes - * - const mld_poly *a: pointer to input polynomial - **************************************************/ + * @param[out] r Pointer to output byte array with at least + * MLDSA_POLYETA_PACKEDBYTES bytes. + * @param[in] a Pointer to input polynomial. + */ MLD_INTERNAL_API void mld_polyeta_pack(uint8_t r[MLDSA_POLYETA_PACKEDBYTES], const mld_poly *a) __contract__( @@ -261,12 +229,14 @@ __contract__( requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLDSA_ETA + 1)) assigns(memory_slice(r, MLDSA_POLYETA_PACKEDBYTES)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_SIGN_API) /* - * polyeta_unpack produces coefficients in [-MLDSA_ETA,MLDSA_ETA] for + * polyeta_unpack produces coefficients in [-MLDSA_ETA, MLDSA_ETA] for * well-formed inputs (i.e., those produced by polyeta_pack). * However, when passed an arbitrary byte array, it may produce smaller values, - * i.e, values in [MLD_POLYETA_UNPACK_LOWER_BOUND,MLDSA_ETA] + * i.e., values in [MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA]. * Even though this should never happen, we use use the bound for arbitrary * inputs in the CBMC proofs. */ @@ -279,14 +249,12 @@ __contract__( #endif #define mld_polyeta_unpack MLD_NAMESPACE_KL(polyeta_unpack) -/************************************************* - * Name: mld_polyeta_unpack +/** + * Unpack polynomial with coefficients in [-MLDSA_ETA, MLDSA_ETA]. * - * Description: Unpack polynomial with coefficients in [-MLDSA_ETA,MLDSA_ETA]. - * - * Arguments: - mld_poly *r: pointer to output polynomial - * - const uint8_t *a: byte array with bit-packed polynomial - **************************************************/ + * @param[out] r Pointer to output polynomial. + * @param[in] a Byte array with bit-packed polynomial. 
+ */ MLD_INTERNAL_API void mld_polyeta_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYETA_PACKEDBYTES]) __contract__( @@ -295,18 +263,18 @@ __contract__( assigns(memory_slice(r, sizeof(mld_poly))) ensures(array_bound(r->coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1)) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) #define mld_polyz_pack MLD_NAMESPACE_KL(polyz_pack) -/************************************************* - * Name: mld_polyz_pack - * - * Description: Bit-pack polynomial with coefficients - * in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. +/** + * Bit-pack polynomial with coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. * - * Arguments: - uint8_t *r: pointer to output byte array with at least - * MLDSA_POLYZ_PACKEDBYTES bytes - * - const mld_poly *a: pointer to input polynomial - **************************************************/ + * @param[out] r Pointer to output byte array with at least + * MLDSA_POLYZ_PACKEDBYTES bytes. + * @param[in] a Pointer to input polynomial. + */ MLD_INTERNAL_API void mld_polyz_pack(uint8_t r[MLDSA_POLYZ_PACKEDBYTES], const mld_poly *a) __contract__( @@ -315,18 +283,17 @@ __contract__( requires(array_bound(a->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) assigns(memory_slice(r, MLDSA_POLYZ_PACKEDBYTES)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ - +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_polyz_unpack MLD_NAMESPACE_KL(polyz_unpack) -/************************************************* - * Name: mld_polyz_unpack +/** + * Unpack polynomial z with coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. * - * Description: Unpack polynomial z with coefficients - * in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. 
- * - * Arguments: - mld_poly *r: pointer to output polynomial - * - const uint8_t *a: byte array with bit-packed polynomial - **************************************************/ + * @param[out] r Pointer to output polynomial. + * @param[in] a Byte array with bit-packed polynomial. + */ MLD_INTERNAL_API void mld_polyz_unpack(mld_poly *r, const uint8_t a[MLDSA_POLYZ_PACKEDBYTES]) __contract__( @@ -337,16 +304,14 @@ __contract__( ); #define mld_polyw1_pack MLD_NAMESPACE_KL(polyw1_pack) -/************************************************* - * Name: mld_polyw1_pack +/** + * Bit-pack polynomial w1 with coefficients in [0, 15] or [0, 43]. Input + * coefficients are assumed to be standard representatives. * - * Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. - * Input coefficients are assumed to be standard representatives. - * - * Arguments: - uint8_t *r: pointer to output byte array with at least - * MLDSA_POLYW1_PACKEDBYTES bytes - * - const mld_poly *a: pointer to input polynomial - **************************************************/ + * @param[out] r Pointer to output byte array with at least + * MLDSA_POLYW1_PACKEDBYTES bytes. + * @param[in] a Pointer to input polynomial. 
+ */ MLD_INTERNAL_API void mld_polyw1_pack(uint8_t r[MLDSA_POLYW1_PACKEDBYTES], const mld_poly *a) __contract__( @@ -355,5 +320,6 @@ __contract__( requires(array_bound(a->coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) assigns(memory_slice(r, MLDSA_POLYW1_PACKEDBYTES)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ #endif /* !MLD_POLY_KL_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c index 5c3a08ab324..e02fc07dbf3 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c +++ b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.c @@ -12,241 +12,21 @@ * https://csrc.nist.gov/pubs/fips/204/final */ -#include -#include +#include "polyvec.h" -#include "common.h" #include "debug.h" -#include "poly.h" -#include "poly_kl.h" -#include "polyvec.h" +#include "polyvec_lazy.h" /* This namespacing is not done at the top to avoid a naming conflict * with native backends, which are currently not yet namespaced. */ -#define mld_polymat_permute_bitrev_to_custom \ - MLD_ADD_PARAM_SET(mld_polymat_permute_bitrev_to_custom) -#define mld_polyvecl_permute_bitrev_to_custom \ - MLD_ADD_PARAM_SET(mld_polyvecl_permute_bitrev_to_custom) #define mld_polyvecl_pointwise_acc_montgomery_c \ MLD_ADD_PARAM_SET(mld_polyvecl_pointwise_acc_montgomery_c) -#if !defined(MLD_CONFIG_REDUCE_RAM) -/* Helper function to ensure that the polynomial entries in the output - * of mld_polyvec_matrix_expand use the standard (bitreversed) ordering - * of coefficients. - * No-op unless a native backend with a custom ordering is used. - */ - -static void mld_polyvecl_permute_bitrev_to_custom(mld_polyvecl *v) -__contract__( - /* We don't specify that this should be a permutation, but only - * that it does not change the bound established at the end of - * mld_polyvec_matrix_expand. 
- */ - requires(memory_no_alias(v, sizeof(mld_polyvecl))) - requires(forall(x, 0, MLDSA_L, - array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) - assigns(memory_slice(v, sizeof(mld_polyvecl))) - ensures(forall(x, 0, MLDSA_L, - array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) -{ -#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER) - unsigned i; - for (i = 0; i < MLDSA_L; i++) - __loop__( - assigns(i, memory_slice(v, sizeof(mld_polyvecl))) - invariant(i <= MLDSA_L) - invariant(forall(x, 0, MLDSA_L, - array_bound(v->vec[x].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - { - mld_poly_permute_bitrev_to_custom(v->vec[i].coeffs); - } -#else /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */ - /* Nothing to do */ - (void)v; -#endif /* !MLD_USE_NATIVE_NTT_CUSTOM_ORDER */ -} - -static void mld_polymat_permute_bitrev_to_custom(mld_polymat *mat) -__contract__( - /* We don't specify that this should be a permutation, but only - * that it does not change the bound established at the end of - * mld_polyvec_matrix_expand. 
- */ - requires(memory_no_alias(mat, sizeof(mld_polymat))) - requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - assigns(memory_slice(mat, sizeof(mld_polymat))) - ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) -) -{ - unsigned int i; - for (i = 0; i < MLDSA_K; i++) - __loop__( - assigns(i, memory_slice(mat, sizeof(mld_polymat))) - invariant(i <= MLDSA_K) - invariant(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))))) - { - mld_polyvecl_permute_bitrev_to_custom(&mat->vec[i]); - } -} -#endif /* !MLD_CONFIG_REDUCE_RAM */ - -MLD_INTERNAL_API -const mld_polyvecl *mld_polymat_get_row(mld_polymat *mat, unsigned int row) -{ -#if defined(MLD_CONFIG_REDUCE_RAM) - unsigned int i; - MLD_ALIGN uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; - - mld_memcpy(seed_ext, mat->rho, MLDSA_SEEDBYTES); - - /* Generate row on-demand */ - for (i = 0; i < MLDSA_L; i++) - { - uint8_t x = (uint8_t)row; - uint8_t y = (uint8_t)i; - - seed_ext[MLDSA_SEEDBYTES + 0] = y; - seed_ext[MLDSA_SEEDBYTES + 1] = x; - - mld_poly_uniform(&mat->row_buffer.vec[i], seed_ext); - -#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER) - mld_poly_permute_bitrev_to_custom(mat->row_buffer.vec[i].coeffs); -#endif - } - - /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. 
*/ - mld_zeroize(seed_ext, sizeof(seed_ext)); - - return &mat->row_buffer; -#else /* MLD_CONFIG_REDUCE_RAM */ - return &mat->vec[row]; -#endif /* !MLD_CONFIG_REDUCE_RAM */ -} - -MLD_INTERNAL_API -void mld_polyvec_matrix_expand(mld_polymat *mat, - const uint8_t rho[MLDSA_SEEDBYTES]) -{ -#if defined(MLD_CONFIG_REDUCE_RAM) - /* In REDUCE_RAM mode, just copy the seed for later on-demand generation */ - mld_memcpy(mat->rho, rho, MLDSA_SEEDBYTES); -#else - unsigned int i, j; - /* - * We generate four separate seed arrays rather than a single one to work - * around limitations in CBMC function contracts dealing with disjoint slices - * of the same parent object. - */ - - MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; - - for (j = 0; j < 4; j++) - __loop__( - assigns(j, object_whole(seed_ext)) - invariant(j <= 4) - ) - { - mld_memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES); - } - -#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) - /* Sample 4 matrix entries a time. */ - for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4) - __loop__( - assigns(i, j, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat))) - invariant(i <= (MLDSA_K * MLDSA_L / 4) * 4 && i % 4 == 0) - /* vectors 0 .. 
i / MLDSA_L are completely sampled */ - invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - /* last vector is sampled up to i % MLDSA_L */ - invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L, - array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - ) - { - for (j = 0; j < 4; j++) - __loop__( - assigns(j, object_whole(seed_ext)) - invariant(j <= 4) - ) - { - uint8_t x = (uint8_t)((i + j) / MLDSA_L); - uint8_t y = (uint8_t)((i + j) % MLDSA_L); - - seed_ext[j][MLDSA_SEEDBYTES + 0] = y; - seed_ext[j][MLDSA_SEEDBYTES + 1] = x; - } - - mld_poly_uniform_4x(&mat->vec[i / MLDSA_L].vec[i % MLDSA_L], - &mat->vec[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L], - &mat->vec[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L], - &mat->vec[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L], - seed_ext); - } -#else /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */ - i = 0; -#endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */ - - /* Entries omitted by the batch-sampling are sampled individually. */ - while (i < MLDSA_K * MLDSA_L) - __loop__( - assigns(i, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat))) - invariant(i <= MLDSA_K * MLDSA_L) - /* vectors 0 .. 
i / MLDSA_L are completely sampled */ - invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - /* last vector is sampled up to i % MLDSA_L */ - invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L, - array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - ) - { - uint8_t x = (uint8_t)(i / MLDSA_L); - uint8_t y = (uint8_t)(i % MLDSA_L); - mld_poly *this_poly = &mat->vec[i / MLDSA_L].vec[i % MLDSA_L]; - - seed_ext[0][MLDSA_SEEDBYTES + 0] = y; - seed_ext[0][MLDSA_SEEDBYTES + 1] = x; - - mld_poly_uniform(this_poly, seed_ext[0]); - i++; - } - - mld_polymat_permute_bitrev_to_custom(mat); - - /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ - mld_zeroize(seed_ext, sizeof(seed_ext)); -#endif /* !MLD_CONFIG_REDUCE_RAM */ -} - -MLD_INTERNAL_API -void mld_polyvec_matrix_pointwise_montgomery(mld_polyveck *t, mld_polymat *mat, - const mld_polyvecl *v) -{ - unsigned int i; - mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND); - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(t, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(forall(k0, 0, i, - array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q))) - ) - { - const mld_polyvecl *row = mld_polymat_get_row(mat, i); - mld_polyvecl_pointwise_acc_montgomery(&t->vec[i], row, v); - } - - mld_assert_abs_bound_2d(t->vec, MLDSA_K, MLDSA_N, MLDSA_Q); -} - /**************************************************************/ /************ Vectors of polynomials of length MLDSA_L **************/ /**************************************************************/ +#if !defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) MLD_INTERNAL_API void mld_polyvecl_uniform_gamma1(mld_polyvecl *v, const uint8_t seed[MLDSA_CRHBYTES], @@ -289,7 +69,13 @@ void mld_polyvecl_uniform_gamma1(mld_polyvecl *v, mld_assert_bound_2d(v->vec, 
MLDSA_L, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1); } +#endif /* !MLD_CONFIG_NO_SIGN_API && (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) \ + */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API) || \ + (!defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST))) MLD_INTERNAL_API void mld_polyvecl_ntt(mld_polyvecl *v) { @@ -301,14 +87,19 @@ void mld_polyvecl_ntt(mld_polyvecl *v) assigns(i, memory_slice(v, sizeof(mld_polyvecl))) invariant(i <= MLDSA_L) invariant(forall(k0, i, MLDSA_L, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1]))) - invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))) + invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + decreases(MLDSA_L - i)) { mld_poly_ntt(&v->vec[i]); } mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND); } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_VERIFY_API || \ + (!MLD_CONFIG_NO_SIGN_API && (!MLD_CONFIG_REDUCE_RAM || \ + MLD_UNIT_TEST)) */ +#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) MLD_STATIC_TESTABLE void mld_polyvecl_pointwise_acc_montgomery_c( mld_poly *w, const mld_polyvecl *u, const mld_polyvecl *v) __contract__( @@ -331,6 +122,7 @@ __contract__( assigns(i, j, memory_slice(w, sizeof(mld_poly))) invariant(i <= MLDSA_N) invariant(array_abs_bound(w->coeffs, 0, i, MLDSA_Q)) + decreases(MLDSA_N - i) ) { int64_t t = 0; @@ -341,6 +133,7 @@ __contract__( invariant(j <= MLDSA_L) invariant(t >= -(int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1)) invariant(t <= (int64_t)j*(MLDSA_Q - 1)*(MLD_NTT_BOUND - 1)) + decreases(MLDSA_L - j) ) { t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i]; @@ -363,8 +156,8 @@ void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u, mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q); mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, 
MLD_NTT_BOUND); ret = mld_polyvecl_pointwise_acc_montgomery_l4_native( - w->coeffs, (const int32_t(*)[MLDSA_N])u->vec, - (const int32_t(*)[MLDSA_N])v->vec); + w->coeffs, (const int32_t (*)[MLDSA_N])u->vec, + (const int32_t (*)[MLDSA_N])v->vec); if (ret == MLD_NATIVE_FUNC_SUCCESS) { mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q); @@ -376,8 +169,8 @@ void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u, mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q); mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND); ret = mld_polyvecl_pointwise_acc_montgomery_l5_native( - w->coeffs, (const int32_t(*)[MLDSA_N])u->vec, - (const int32_t(*)[MLDSA_N])v->vec); + w->coeffs, (const int32_t (*)[MLDSA_N])u->vec, + (const int32_t (*)[MLDSA_N])v->vec); if (ret == MLD_NATIVE_FUNC_SUCCESS) { mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q); @@ -389,8 +182,8 @@ void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u, mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, 0, MLDSA_Q); mld_assert_abs_bound_2d(v->vec, MLDSA_L, MLDSA_N, MLD_NTT_BOUND); ret = mld_polyvecl_pointwise_acc_montgomery_l7_native( - w->coeffs, (const int32_t(*)[MLDSA_N])u->vec, - (const int32_t(*)[MLDSA_N])v->vec); + w->coeffs, (const int32_t (*)[MLDSA_N])u->vec, + (const int32_t (*)[MLDSA_N])v->vec); if (ret == MLD_NATIVE_FUNC_SUCCESS) { mld_assert_abs_bound(w->coeffs, MLDSA_N, MLDSA_Q); @@ -402,17 +195,20 @@ void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u, MLD_CONFIG_PARAMETER_SET == 65) && \ MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7 && \ MLD_CONFIG_PARAMETER_SET == 87 */ - /* The first input is bounded by [0, Q-1] inclusive - * The second input is bounded by [-9Q+1, 9Q-1] inclusive . Hence, we can - * safely accumulate in 64-bits without intermediate reductions as - * MLDSA_L * (MLD_NTT_BOUND-1) * (Q-1) < INT64_MAX + /* The first input is bounded by [0, MLDSA_Q-1] inclusive. 
+ * The second input is bounded by [-(9*MLDSA_Q-1), 9*MLDSA_Q-1] inclusive. + * Hence, we can safely accumulate in 64-bits without intermediate reductions + * as MLDSA_L * (MLD_NTT_BOUND-1) * (MLDSA_Q-1) < INT64_MAX. * - * The worst case is ML-DSA-87: 7 * (9Q-1) * (Q-1) < 2**52 - * (and likewise for negative values) + * The worst case is ML-DSA-87: 7 * (9*MLDSA_Q-1) * (MLDSA_Q-1) < 2**52 + * (and likewise for negative values). */ mld_polyvecl_pointwise_acc_montgomery_c(w, u, v); } +#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_VERIFY_API) || \ + defined(MLD_UNIT_TEST) MLD_INTERNAL_API uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t bound) { @@ -426,6 +222,7 @@ uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t bound) invariant(i <= MLDSA_L) invariant(t == 0 || t == 0xFFFFFFFF) invariant((t == 0) == forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, bound))) + decreases(MLDSA_L - i) ) { /* Reference: Leaks which polynomial violates the bound via a conditional. 
@@ -436,10 +233,15 @@ uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t bound) } return t; } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_VERIFY_API || \ + MLD_UNIT_TEST */ /**************************************************************/ /************ Vectors of polynomials of length MLDSA_K **************/ /**************************************************************/ +#if (!defined(MLD_CONFIG_NO_SIGN_API) && \ + defined(MLD_CONFIG_REDUCE_RAM)) || \ + defined(MLD_UNIT_TEST) MLD_INTERNAL_API void mld_polyveck_reduce(mld_polyveck *v) { @@ -454,6 +256,7 @@ void mld_polyveck_reduce(mld_polyveck *v) invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1]))) invariant(forall(k2, 0, i, array_bound(v->vec[k2].coeffs, 0, MLDSA_N, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX))) + decreases(MLDSA_K - i) ) { mld_poly_reduce(&v->vec[i]); @@ -462,7 +265,10 @@ void mld_polyveck_reduce(mld_polyveck *v) mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX); } +#endif /* (!MLD_CONFIG_NO_SIGN_API && MLD_CONFIG_REDUCE_RAM) || MLD_UNIT_TEST \ + */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST) MLD_INTERNAL_API void mld_polyveck_caddq(mld_polyveck *v) { @@ -474,80 +280,18 @@ void mld_polyveck_caddq(mld_polyveck *v) assigns(i, memory_slice(v, sizeof(mld_polyveck))) invariant(i <= MLDSA_K) invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1]))) - invariant(forall(k1, 0, i, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - { - mld_poly_caddq(&v->vec[i]); - } - - mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q); -} - -/* Reference: We use destructive version (output=first input) to avoid - * reasoning about aliasing in the CBMC specification */ -MLD_INTERNAL_API -void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v) -{ - unsigned int i; - - for (i = 0; i < 
MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(u, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(forall(k0, i, MLDSA_K, - forall(k1, 0, MLDSA_N, u->vec[k0].coeffs[k1] == loop_entry(*u).vec[k0].coeffs[k1]))) - invariant(forall(k6, 0, i, array_bound(u->vec[k6].coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX))) - ) - { - mld_poly_add(&u->vec[i], &v->vec[i]); - } - mld_assert_bound_2d(u->vec, MLDSA_L, MLDSA_N, INT32_MIN, - MLD_REDUCE32_DOMAIN_MAX); -} - -MLD_INTERNAL_API -void mld_polyveck_sub(mld_polyveck *u, const mld_polyveck *v) -{ - unsigned int i; - mld_assert_abs_bound_2d(u->vec, MLDSA_K, MLDSA_N, MLDSA_Q); - mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLDSA_Q); - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(u, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(forall(k0, 0, i, - array_bound(u->vec[k0].coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX))) - invariant(forall(k1, i, MLDSA_K, - forall(n1, 0, MLDSA_N, u->vec[k1].coeffs[n1] == loop_entry(*u).vec[k1].coeffs[n1])))) - { - mld_poly_sub(&u->vec[i], &v->vec[i]); - } - - mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, INT32_MIN, - MLD_REDUCE32_DOMAIN_MAX); -} - -MLD_INTERNAL_API -void mld_polyveck_shiftl(mld_polyveck *v) -{ - unsigned int i; - mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, 1 << 10); - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(v, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) invariant(forall(k1, 0, i, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) - invariant(forall(k1, i, MLDSA_K, - forall(n1, 0, MLDSA_N, v->vec[k1].coeffs[n1] == loop_entry(*v).vec[k1].coeffs[n1]))) - ) + decreases(MLDSA_K - i)) { - mld_poly_shiftl(&v->vec[i]); + mld_poly_caddq(&v->vec[i]); } mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q); } +#endif /* !MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST */ +#if (!defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST)) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || 
defined(MLD_UNIT_TEST)) MLD_INTERNAL_API void mld_polyveck_ntt(mld_polyveck *v) { @@ -559,13 +303,17 @@ void mld_polyveck_ntt(mld_polyveck *v) assigns(i, memory_slice(v, sizeof(mld_polyveck))) invariant(i <= MLDSA_K) invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1]))) - invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)))) + invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + decreases(MLDSA_K - i)) { mld_poly_ntt(&v->vec[i]); } mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_NTT_BOUND); } +#endif /* (!MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST) && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST) MLD_INTERNAL_API void mld_polyveck_invntt_tomont(mld_polyveck *v) { @@ -577,33 +325,17 @@ void mld_polyveck_invntt_tomont(mld_polyveck *v) assigns(i, memory_slice(v, sizeof(mld_polyveck))) invariant(i <= MLDSA_K) invariant(forall(k0, i, MLDSA_K, forall(k1, 0, MLDSA_N, v->vec[k0].coeffs[k1] == loop_entry(*v).vec[k0].coeffs[k1]))) - invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_INTT_BOUND)))) + invariant(forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))) + decreases(MLDSA_K - i)) { mld_poly_invntt_tomont(&v->vec[i]); } mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_INTT_BOUND); } +#endif /* !MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST */ -MLD_INTERNAL_API -void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *r, const mld_poly *a, - const mld_polyveck *v) -{ - unsigned int i; - mld_assert_abs_bound_2d(v->vec, MLDSA_K, MLDSA_N, MLD_NTT_BOUND); - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(r, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(forall(k2, 0, i, array_abs_bound(r->vec[k2].coeffs, 0, MLDSA_N, MLDSA_Q))) - ) - { - 
mld_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); - } - mld_assert_abs_bound_2d(r->vec, MLDSA_K, MLDSA_N, MLDSA_Q); -} - +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_INTERNAL_API uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t bound) { @@ -617,6 +349,7 @@ uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t bound) invariant(i <= MLDSA_K) invariant(t == 0 || t == 0xFFFFFFFF) invariant((t == 0) == forall(k1, 0, i, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, bound))) + decreases(MLDSA_K - i) ) { /* Reference: Leaks which polynomial violates the bound via a conditional. @@ -629,30 +362,9 @@ uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t bound) return t; } -MLD_INTERNAL_API -void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0, - const mld_polyveck *v) -{ - unsigned int i; - mld_assert_bound_2d(v->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q); - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(v0, sizeof(mld_polyveck)), memory_slice(v1, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(forall(k1, 0, i, array_bound(v0->vec[k1].coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1))) - invariant(forall(k2, 0, i, array_bound(v1->vec[k2].coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1))) - ) - { - mld_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); - } - - mld_assert_bound_2d(v0->vec, MLDSA_K, MLDSA_N, -(MLD_2_POW_D / 2) + 1, - (MLD_2_POW_D / 2) + 1); - mld_assert_bound_2d(v1->vec, MLDSA_K, MLDSA_N, 0, - ((MLDSA_Q - 1) / MLD_2_POW_D) + 1); -} +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_INTERNAL_API void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0) { @@ -669,6 +381,7 @@ void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0) array_abs_bound(v0->vec[k2].coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1))) invariant(forall(k3, i, MLDSA_K, array_bound(v0->vec[k3].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) + decreases(MLDSA_K - i) ) { 
mld_poly_decompose(&v1->vec[i], &v0->vec[i]); @@ -678,52 +391,9 @@ void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0) (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); mld_assert_abs_bound_2d(v0->vec, MLDSA_K, MLDSA_N, MLDSA_GAMMA2 + 1); } +#endif /* !MLD_CONFIG_NO_SIGN_API */ -MLD_INTERNAL_API -unsigned int mld_polyveck_make_hint(mld_polyveck *h, const mld_polyveck *v0, - const mld_polyveck *v1) -{ - unsigned int i, s = 0; - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, s, memory_slice(h, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(s <= i * MLDSA_N) - invariant(forall(k1, 0, i, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - ) - { - s += mld_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); - } - - mld_assert_bound_2d(h->vec, MLDSA_K, MLDSA_N, 0, 2); - return s; -} - -MLD_INTERNAL_API -void mld_polyveck_use_hint(mld_polyveck *w, const mld_polyveck *u, - const mld_polyveck *h) -{ - unsigned int i; - mld_assert_bound_2d(u->vec, MLDSA_K, MLDSA_N, 0, MLDSA_Q); - mld_assert_bound_2d(h->vec, MLDSA_K, MLDSA_N, 0, 2); - - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(w, sizeof(mld_polyveck))) - invariant(i <= MLDSA_K) - invariant(forall(k2, 0, i, - array_bound(w->vec[k2].coeffs, 0, MLDSA_N, 0, - (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)))) - ) - { - mld_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); - } - - mld_assert_bound_2d(w->vec, MLDSA_K, MLDSA_N, 0, - (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); -} - +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_INTERNAL_API void mld_polyveck_pack_w1(uint8_t r[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES], const mld_polyveck *w1) @@ -736,12 +406,15 @@ void mld_polyveck_pack_w1(uint8_t r[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES], __loop__( assigns(i, memory_slice(r, MLDSA_K * MLDSA_POLYW1_PACKEDBYTES)) invariant(i <= MLDSA_K) + decreases(MLDSA_K - i) ) { mld_polyw1_pack(&r[i * MLDSA_POLYW1_PACKEDBYTES], &w1->vec[i]); } } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) 
MLD_INTERNAL_API void mld_polyveck_pack_eta(uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES], const mld_polyveck *p) @@ -752,6 +425,7 @@ void mld_polyveck_pack_eta(uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES], __loop__( assigns(i, memory_slice(r, MLDSA_K * MLDSA_POLYETA_PACKEDBYTES)) invariant(i <= MLDSA_K) + decreases(MLDSA_K - i) ) { mld_polyeta_pack(&r[i * MLDSA_POLYETA_PACKEDBYTES], &p->vec[i]); @@ -768,29 +442,18 @@ void mld_polyvecl_pack_eta(uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES], __loop__( assigns(i, memory_slice(r, MLDSA_L * MLDSA_POLYETA_PACKEDBYTES)) invariant(i <= MLDSA_L) + decreases(MLDSA_L - i) ) { mld_polyeta_pack(&r[i * MLDSA_POLYETA_PACKEDBYTES], &p->vec[i]); } } -MLD_INTERNAL_API -void mld_polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES], - const mld_polyveck *p) -{ - unsigned int i; - mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1, - (1 << (MLDSA_D - 1)) + 1); - for (i = 0; i < MLDSA_K; ++i) - __loop__( - assigns(i, memory_slice(r, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) - invariant(i <= MLDSA_K) - ) - { - mld_polyt0_pack(&r[i * MLDSA_POLYT0_PACKEDBYTES], &p->vec[i]); - } -} +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || \ + (!defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST))) MLD_INTERNAL_API void mld_polyvecl_unpack_eta( mld_polyvecl *p, const uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES]) @@ -804,7 +467,10 @@ void mld_polyvecl_unpack_eta( mld_assert_bound_2d(p->vec, MLDSA_L, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1); } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || (!MLD_CONFIG_NO_SIGN_API && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST)) */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_INTERNAL_API void mld_polyvecl_unpack_z(mld_polyvecl *z, const uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES]) @@ -818,7 +484,11 @@ void mld_polyvecl_unpack_z(mld_polyvecl *z, mld_assert_bound_2d(z->vec, MLDSA_L, MLDSA_N, 
-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1); } +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || \ + (!defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST))) MLD_INTERNAL_API void mld_polyveck_unpack_eta( mld_polyveck *p, const uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES]) @@ -832,23 +502,9 @@ void mld_polyveck_unpack_eta( mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1); } - -MLD_INTERNAL_API -void mld_polyveck_unpack_t0(mld_polyveck *p, - const uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES]) -{ - unsigned int i; - for (i = 0; i < MLDSA_K; ++i) - { - mld_polyt0_unpack(&p->vec[i], r + i * MLDSA_POLYT0_PACKEDBYTES); - } - - mld_assert_bound_2d(p->vec, MLDSA_K, MLDSA_N, -(1 << (MLDSA_D - 1)) + 1, - (1 << (MLDSA_D - 1)) + 1); -} +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || (!MLD_CONFIG_NO_SIGN_API && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST)) */ /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef mld_polymat_permute_bitrev_to_custom -#undef mld_polyvecl_permute_bitrev_to_custom #undef mld_polyvecl_pointwise_acc_montgomery_c diff --git a/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h index ee97508b7a0..68cc71f7530 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/polyvec.h @@ -5,7 +5,6 @@ #ifndef MLD_POLYVEC_H #define MLD_POLYVEC_H -#include #include "cbmc.h" #include "common.h" #include "poly.h" @@ -17,29 +16,27 @@ * within a single compilation unit. */ #define mld_polyvecl MLD_ADD_PARAM_SET(mld_polyvecl) #define mld_polyveck MLD_ADD_PARAM_SET(mld_polyveck) -#define mld_polymat MLD_ADD_PARAM_SET(mld_polymat) /* End of parameter set namespacing */ -/* Vectors of polynomials of length MLDSA_L */ +/** Vector of MLDSA_L polynomials. 
*/ typedef struct { - mld_poly vec[MLDSA_L]; + mld_poly vec[MLDSA_L]; /**< Component polynomials. */ } mld_polyvecl; +#if !defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) #define mld_polyvecl_uniform_gamma1 MLD_NAMESPACE_KL(polyvecl_uniform_gamma1) -/************************************************* - * Name: mld_polyvecl_uniform_gamma1 - * - * Description: Sample vector of polynomials with uniformly random coefficients - * in [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output - * stream of SHAKE256(seed|nonce) - * - * Arguments: - mld_polyvecl *v: pointer to output vector - * - const uint8_t seed[]: byte array with seed of length - * MLDSA_CRHBYTES - * - uint16_t nonce: 16-bit nonce - *************************************************/ +/** + * Sample vector of polynomials with uniformly random coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1] by unpacking output stream of + * SHAKE256(seed|nonce). + * + * @param[out] v Pointer to output vector. + * @param[in] seed Byte array with seed of length MLDSA_CRHBYTES. + * @param nonce 16-bit nonce. + */ MLD_INTERNAL_API void mld_polyvecl_uniform_gamma1(mld_polyvecl *v, const uint8_t seed[MLDSA_CRHBYTES], @@ -52,16 +49,20 @@ __contract__( ensures(forall(k0, 0, MLDSA_L, array_bound(v->vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) ); +#endif /* !MLD_CONFIG_NO_SIGN_API && (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) \ + */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API) || \ + (!defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST))) #define mld_polyvecl_ntt MLD_NAMESPACE_KL(polyvecl_ntt) -/************************************************* - * Name: mld_polyvecl_ntt +/** + * Forward NTT of all polynomials in vector of length MLDSA_L. Coefficients + * can grow by 8*MLDSA_Q in absolute value. * - * Description: Forward NTT of all polynomials in vector of length MLDSA_L. 
- * Coefficients can grow by 8*MLDSA_Q in absolute value. - * - * Arguments: - mld_polyvecl *v: pointer to input/output vector - **************************************************/ + * @param[in,out] v Pointer to input/output vector. + */ MLD_INTERNAL_API void mld_polyvecl_ntt(mld_polyvecl *v) __contract__( @@ -70,30 +71,28 @@ __contract__( assigns(memory_slice(v, sizeof(mld_polyvecl))) ensures(forall(k1, 0, MLDSA_L, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_VERIFY_API || \ + (!MLD_CONFIG_NO_SIGN_API && (!MLD_CONFIG_REDUCE_RAM || \ + MLD_UNIT_TEST)) */ +#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) #define mld_polyvecl_pointwise_acc_montgomery \ MLD_NAMESPACE_KL(polyvecl_pointwise_acc_montgomery) -/************************************************* - * Name: mld_polyvecl_pointwise_acc_montgomery +/** + * Pointwise multiply vectors of polynomials of length MLDSA_L, multiply + * resulting vector by 2^{-32} and add (accumulate) polynomials in it. + * Input/output vectors are in NTT domain representation. * - * Description: Pointwise multiply vectors of polynomials of length MLDSA_L, - * multiply resulting vector by 2^{-32} and add (accumulate) - * polynomials in it. - * Input/output vectors are in NTT domain representation. + * The first input "u" must be the output of polyvec_matrix_expand() and so + * have coefficients in [0, MLDSA_Q-1] inclusive. * - * The first input "u" must be the output of - * polyvec_matrix_expand() and so have coefficients in [0, Q-1] - * inclusive. + * The second input "v" is assumed to be output of an NTT, and hence must have + * coefficients bounded by [-(9*MLDSA_Q-1), 9*MLDSA_Q-1] inclusive. * - * The second input "v" is assumed to be output of an NTT, and - * hence must have coefficients bounded by [-9q+1, +9q-1] - * inclusive. 
- * - * - * Arguments: - mld_poly *w: output polynomial - * - const mld_polyvecl *u: pointer to first input vector - * - const mld_polyvecl *v: pointer to second input vector - **************************************************/ + * @param[out] w Output polynomial. + * @param[in] u Pointer to first input vector. + * @param[in] v Pointer to second input vector. + */ MLD_INTERNAL_API void mld_polyvecl_pointwise_acc_montgomery(mld_poly *w, const mld_polyvecl *u, const mld_polyvecl *v) @@ -108,23 +107,22 @@ __contract__( assigns(memory_slice(w, sizeof(mld_poly))) ensures(array_abs_bound(w->coeffs, 0, MLDSA_N, MLDSA_Q)) ); +#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ - +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_polyvecl_chknorm MLD_NAMESPACE_KL(polyvecl_chknorm) -/************************************************* - * Name: mld_polyvecl_chknorm +/** + * Check infinity norm of polynomials in vector of length MLDSA_L. Assumes + * input mld_polyvecl to be reduced by polyvecl_reduce(). * - * Description: Check infinity norm of polynomials in vector of length MLDSA_L. - * Assumes input mld_polyvecl to be reduced by polyvecl_reduce(). + * @param[in] v Pointer to vector. + * @param B Norm bound. * - * Arguments: - const mld_polyvecl *v: pointer to vector - * - int32_t B: norm bound - * - * Returns 0 if norm of all polynomials is strictly smaller than B <= - * (MLDSA_Q-1)/8 and 0xFFFFFFFF otherwise. - **************************************************/ -MLD_MUST_CHECK_RETURN_VALUE + * @return 0 if norm of all polynomials is strictly smaller than + * B <= (MLDSA_Q-1)/8 and 0xFFFFFFFF otherwise. 
+ */ MLD_INTERNAL_API +MLD_MUST_CHECK_RETURN_VALUE uint32_t mld_polyvecl_chknorm(const mld_polyvecl *v, int32_t B) __contract__( requires(memory_no_alias(v, sizeof(mld_polyvecl))) @@ -134,34 +132,23 @@ __contract__( ensures(return_value == 0 || return_value == 0xFFFFFFFF) ensures((return_value == 0) == forall(k1, 0, MLDSA_L, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, B))) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_VERIFY_API */ -/* Vectors of polynomials of length MLDSA_K */ +/** Vector of MLDSA_K polynomials. */ typedef struct { - mld_poly vec[MLDSA_K]; + mld_poly vec[MLDSA_K]; /**< Component polynomials. */ } mld_polyveck; -/* Matrix of polynomials (K x L) */ -typedef struct -{ -#if defined(MLD_CONFIG_REDUCE_RAM) - mld_polyvecl row_buffer; - uint8_t rho[MLDSA_SEEDBYTES]; -#else - mld_polyvecl vec[MLDSA_K]; -#endif -} mld_polymat; - +#if (!defined(MLD_CONFIG_NO_SIGN_API) && defined(MLD_CONFIG_REDUCE_RAM)) || \ + defined(MLD_UNIT_TEST) #define mld_polyveck_reduce MLD_NAMESPACE_KL(polyveck_reduce) -/************************************************* - * Name: polyveck_reduce +/** + * Reduce coefficients of polynomials in vector of length MLDSA_K to + * representatives in [-MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX]. * - * Description: Reduce coefficients of polynomials in vector of length MLDSA_K - * to representatives in - *[-MLD_REDUCE32_RANGE_MAX,MLD_REDUCE32_RANGE_MAX]. - * - * Arguments: - mld_polyveck *v: pointer to input/output vector - **************************************************/ + * @param[in,out] v Pointer to input/output vector. 
+ */ MLD_INTERNAL_API void mld_polyveck_reduce(mld_polyveck *v) __contract__( @@ -172,16 +159,17 @@ __contract__( ensures(forall(k1, 0, MLDSA_K, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX))) ); +#endif /* (!MLD_CONFIG_NO_SIGN_API && MLD_CONFIG_REDUCE_RAM) || MLD_UNIT_TEST \ + */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST) #define mld_polyveck_caddq MLD_NAMESPACE_KL(polyveck_caddq) -/************************************************* - * Name: mld_polyveck_caddq +/** + * For all coefficients of polynomials in vector of length MLDSA_K add MLDSA_Q + * if coefficient is negative. * - * Description: For all coefficients of polynomials in vector of length MLDSA_K - * add MLDSA_Q if coefficient is negative. - * - * Arguments: - mld_polyveck *v: pointer to input/output vector - **************************************************/ + * @param[in,out] v Pointer to input/output vector. + */ MLD_INTERNAL_API void mld_polyveck_caddq(mld_polyveck *v) __contract__( @@ -192,83 +180,17 @@ __contract__( ensures(forall(k1, 0, MLDSA_K, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST */ -#define mld_polyveck_add MLD_NAMESPACE_KL(polyveck_add) -/************************************************* - * Name: mld_polyveck_add - * - * Description: Add vectors of polynomials of length MLDSA_K. - * No modular reduction is performed. 
- * - * Arguments: - mld_polyveck *u: pointer to input-output vector of polynomials - * to be added to - * - const mld_polyveck *v: pointer to second input vector of - * polynomials - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_add(mld_polyveck *u, const mld_polyveck *v) -__contract__( - requires(memory_no_alias(u, sizeof(mld_polyveck))) - requires(memory_no_alias(v, sizeof(mld_polyveck))) - requires(forall(p0, 0, MLDSA_K, array_abs_bound(u->vec[p0].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))) - requires(forall(p1, 0, MLDSA_K, - array_bound(v->vec[p1].coeffs, 0, MLDSA_N, -MLD_REDUCE32_RANGE_MAX, MLD_REDUCE32_RANGE_MAX))) - assigns(memory_slice(u, sizeof(mld_polyveck))) - ensures(forall(q2, 0, MLDSA_K, - array_bound(u->vec[q2].coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX))) -); - -#define mld_polyveck_sub MLD_NAMESPACE_KL(polyveck_sub) -/************************************************* - * Name: mld_polyveck_sub - * - * Description: Subtract vectors of polynomials of length MLDSA_K. - * No modular reduction is performed. 
- * - * Arguments: - mld_polyveck *u: pointer to first input vector - * - const mld_polyveck *v: pointer to second input vector to be - * subtracted from first input vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_sub(mld_polyveck *u, const mld_polyveck *v) -__contract__( - requires(memory_no_alias(u, sizeof(mld_polyveck))) - requires(memory_no_alias(v, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, array_abs_bound(u->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q))) - requires(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q))) - assigns(memory_slice(u, sizeof(mld_polyveck))) - ensures(forall(k0, 0, MLDSA_K, - array_bound(u->vec[k0].coeffs, 0, MLDSA_N, INT32_MIN, MLD_REDUCE32_DOMAIN_MAX))) -); - -#define mld_polyveck_shiftl MLD_NAMESPACE_KL(polyveck_shiftl) -/************************************************* - * Name: mld_polyveck_shiftl - * - * Description: Multiply vector of polynomials of Length MLDSA_K by 2^MLDSA_D - *without modular reduction. Assumes input coefficients to be less than - *2^{31-MLDSA_D}. - * - * Arguments: - mld_polyveck *v: pointer to input/output vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_shiftl(mld_polyveck *v) -__contract__( - requires(memory_no_alias(v, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, 1 << 10))) - assigns(memory_slice(v, sizeof(mld_polyveck))) - ensures(forall(k1, 0, MLDSA_K, array_bound(v->vec[k1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) -); - +#if (!defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST)) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) #define mld_polyveck_ntt MLD_NAMESPACE_KL(polyveck_ntt) -/************************************************* - * Name: mld_polyveck_ntt - * - * Description: Forward NTT of all polynomials in vector of length MLDSA_K. - * Coefficients can grow by 8*MLDSA_Q in absolute value. 
+/** + * Forward NTT of all polynomials in vector of length MLDSA_K. Coefficients + * can grow by 8*MLDSA_Q in absolute value. * - * Arguments: - mld_polyveck *v: pointer to input/output vector - **************************************************/ + * @param[in,out] v Pointer to input/output vector. + */ MLD_INTERNAL_API void mld_polyveck_ntt(mld_polyveck *v) __contract__( @@ -277,17 +199,20 @@ __contract__( assigns(memory_slice(v, sizeof(mld_polyveck))) ensures(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) ); +#endif /* (!MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST) && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST) #define mld_polyveck_invntt_tomont MLD_NAMESPACE_KL(polyveck_invntt_tomont) -/************************************************* - * Name: mld_polyveck_invntt_tomont +/** + * Inverse NTT and multiplication by 2^{32} of polynomials in vector of + * length MLDSA_K. + * + * Input coefficients need to be less than MLDSA_Q, and output coefficients + * are bounded by MLD_INTT_BOUND. * - * Description: Inverse NTT and multiplication by 2^{32} of polynomials - * in vector of length MLDSA_K. - * Input coefficients need to be less than MLDSA_Q, and - * Output coefficients are bounded by MLD_INTT_BOUND. - * Arguments: - mld_polyveck *v: pointer to input/output vector - **************************************************/ + * @param[in,out] v Pointer to input/output vector. 
+ */ MLD_INTERNAL_API void mld_polyveck_invntt_tomont(mld_polyveck *v) __contract__( @@ -296,48 +221,22 @@ __contract__( assigns(memory_slice(v, sizeof(mld_polyveck))) ensures(forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST */ -#define mld_polyveck_pointwise_poly_montgomery \ - MLD_NAMESPACE_KL(polyveck_pointwise_poly_montgomery) -/************************************************* - * Name: mld_polyveck_pointwise_poly_montgomery - * - * Description: Pointwise multiplication of a polynomial vector of length - * MLDSA_K by a single polynomial in NTT domain and multiplication - * of the resulting polynomial vector by 2^{-32}. - * - * Arguments: - mld_polyveck *r: pointer to output vector - * - mld_poly *a: pointer to input polynomial - * - mld_polyveck *v: pointer to input vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_pointwise_poly_montgomery(mld_polyveck *r, const mld_poly *a, - const mld_polyveck *v) -__contract__( - requires(memory_no_alias(r, sizeof(mld_polyveck))) - requires(memory_no_alias(a, sizeof(mld_poly))) - requires(memory_no_alias(v, sizeof(mld_polyveck))) - requires(array_abs_bound(a->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) - requires(forall(k0, 0, MLDSA_K, array_abs_bound(v->vec[k0].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - assigns(memory_slice(r, sizeof(mld_polyveck))) - ensures(forall(k1, 0, MLDSA_K, array_abs_bound(r->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q))) -); - +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #define mld_polyveck_chknorm MLD_NAMESPACE_KL(polyveck_chknorm) -/************************************************* - * Name: mld_polyveck_chknorm - * - * Description: Check infinity norm of polynomials in vector of length MLDSA_K. - * Assumes input mld_polyveck to be reduced by polyveck_reduce(). +/** + * Check infinity norm of polynomials in vector of length MLDSA_K. 
Assumes + * input mld_polyveck to be reduced by polyveck_reduce(). * - * Arguments: - const mld_polyveck *v: pointer to vector - * - int32_t B: norm bound + * @param[in] v Pointer to vector. + * @param B Norm bound. * - * Returns 0 if norm of all polynomials are strictly smaller than B <= - *(MLDSA_Q-1)/8 and 0xFFFFFFFF otherwise. - **************************************************/ -MLD_MUST_CHECK_RETURN_VALUE + * @return 0 if norm of all polynomials are strictly smaller than + * B <= (MLDSA_Q-1)/8 and 0xFFFFFFFF otherwise. + */ MLD_INTERNAL_API +MLD_MUST_CHECK_RETURN_VALUE uint32_t mld_polyveck_chknorm(const mld_polyveck *v, int32_t B) __contract__( requires(memory_no_alias(v, sizeof(mld_polyveck))) @@ -349,56 +248,26 @@ __contract__( ensures((return_value == 0) == forall(k1, 0, MLDSA_K, array_abs_bound(v->vec[k1].coeffs, 0, MLDSA_N, B))) ); -#define mld_polyveck_power2round MLD_NAMESPACE_KL(polyveck_power2round) -/************************************************* - * Name: mld_polyveck_power2round - * - * Description: For all coefficients a of polynomials in vector of length - *MLDSA_K, compute a0, a1 such that a mod^+ MLDSA_Q = a1*2^MLDSA_D + a0 with - *-2^{MLDSA_D-1} < a0 <= 2^{MLDSA_D-1}. Assumes coefficients to be standard - *representatives. 
- * - * Arguments: - mld_polyveck *v1: pointer to output vector of polynomials with - * coefficients a1 - * - mld_polyveck *v0: pointer to output vector of polynomials with - * coefficients a0 - * - const mld_polyveck *v: pointer to input vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_power2round(mld_polyveck *v1, mld_polyveck *v0, - const mld_polyveck *v) -__contract__( - requires(memory_no_alias(v1, sizeof(mld_polyveck))) - requires(memory_no_alias(v0, sizeof(mld_polyveck))) - requires(memory_no_alias(v, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) - assigns(memory_slice(v1, sizeof(mld_polyveck))) - assigns(memory_slice(v0, sizeof(mld_polyveck))) - ensures(forall(k1, 0, MLDSA_K, array_bound(v0->vec[k1].coeffs, 0, MLDSA_N, -(MLD_2_POW_D/2)+1, (MLD_2_POW_D/2)+1))) - ensures(forall(k2, 0, MLDSA_K, array_bound(v1->vec[k2].coeffs, 0, MLDSA_N, 0, ((MLDSA_Q - 1) / MLD_2_POW_D) + 1))) -); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) #define mld_polyveck_decompose MLD_NAMESPACE_KL(polyveck_decompose) -/************************************************* - * Name: mld_polyveck_decompose - * - * Description: For all coefficients a of polynomials in vector of length - * MLDSA_K, compute high and low bits a0, a1 such a mod^+ MLDSA_Q = a1*ALPHA - * + a0 with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (MLDSA_Q-1)/ALPHA where we set - * a1 = 0 and -ALPHA/2 <= a0 = a mod MLDSA_Q - MLDSA_Q < 0. Assumes coefficients - * to be standard representatives. - * - * Arguments: - mld_polyveck *v1: pointer to output vector of polynomials with - * coefficients a1 - * - mld_polyveck *v0: pointer to input/output vector of - * polynomials with. Output polynomial has - * coefficients a0 - * - * Reference: The reference implementation has the input polynomial as a - * separate argument that may be aliased with either of the outputs. 
- * Removing the aliasing eases CBMC proofs. - * - **************************************************/ +/** + * For all coefficients a of polynomials in vector of length MLDSA_K, compute + * high and low bits a0, a1 such a mod^+ MLDSA_Q = a1*ALPHA + a0 with + * -ALPHA/2 < a0 <= ALPHA/2 except a1 = (MLDSA_Q-1)/ALPHA where we set + * a1 = 0 and -ALPHA/2 <= a0 = a mod MLDSA_Q - MLDSA_Q < 0. Assumes + * coefficients to be standard representatives. + * + * @reference{The reference implementation has the input polynomial as a + * separate argument that may be aliased with either of the outputs. Removing + * the aliasing eases CBMC proofs.} + * + * @param[out] v1 Pointer to output vector of polynomials with + * coefficients a1. + * @param[in,out] v0 Pointer to input/output vector of polynomials. Output + * polynomial has coefficients a0. + */ MLD_INTERNAL_API void mld_polyveck_decompose(mld_polyveck *v1, mld_polyveck *v0) __contract__( @@ -413,71 +282,18 @@ __contract__( ensures(forall(k2, 0, MLDSA_K, array_abs_bound(v0->vec[k2].coeffs, 0, MLDSA_N, MLDSA_GAMMA2+1))) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -#define mld_polyveck_make_hint MLD_NAMESPACE_KL(polyveck_make_hint) -/************************************************* - * Name: mld_polyveck_make_hint - * - * Description: Compute hint vector. - * - * Arguments: - mld_polyveck *h: pointer to output vector - * - const mld_polyveck *v0: pointer to low part of input vector - * - const mld_polyveck *v1: pointer to high part of input vector - * - * Returns number of 1 bits. 
- **************************************************/ -MLD_MUST_CHECK_RETURN_VALUE -MLD_INTERNAL_API -unsigned int mld_polyveck_make_hint(mld_polyveck *h, const mld_polyveck *v0, - const mld_polyveck *v1) -__contract__( - requires(memory_no_alias(h, sizeof(mld_polyveck))) - requires(memory_no_alias(v0, sizeof(mld_polyveck))) - requires(memory_no_alias(v1, sizeof(mld_polyveck))) - assigns(memory_slice(h, sizeof(mld_polyveck))) - ensures(return_value <= MLDSA_N * MLDSA_K) - ensures(forall(k1, 0, MLDSA_K, array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) -); - -#define mld_polyveck_use_hint MLD_NAMESPACE_KL(polyveck_use_hint) -/************************************************* - * Name: mld_polyveck_use_hint - * - * Description: Use hint vector to correct the high bits of input vector. - * - * Arguments: - mld_polyveck *w: pointer to output vector of polynomials with - * corrected high bits - * - const mld_polyveck *u: pointer to input vector - * - const mld_polyveck *h: pointer to input hint vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_use_hint(mld_polyveck *w, const mld_polyveck *v, - const mld_polyveck *h) -__contract__( - requires(memory_no_alias(w, sizeof(mld_polyveck))) - requires(memory_no_alias(v, sizeof(mld_polyveck))) - requires(memory_no_alias(h, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, - array_bound(v->vec[k0].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) - requires(forall(k1, 0, MLDSA_K, - array_bound(h->vec[k1].coeffs, 0, MLDSA_N, 0, 2))) - assigns(memory_slice(w, sizeof(mld_polyveck))) - ensures(forall(k2, 0, MLDSA_K, - array_bound(w->vec[k2].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))) -); - +#if !defined(MLD_CONFIG_NO_SIGN_API) #define mld_polyveck_pack_w1 MLD_NAMESPACE_KL(polyveck_pack_w1) -/************************************************* - * Name: mld_polyveck_pack_w1 - * - * Description: Bit-pack polynomial vector w1 with coefficients in [0,15] or - * [0,43]. 
- * Input coefficients are assumed to be standard representatives. +/** + * Bit-pack polynomial vector w1 with coefficients in [0, 15] or [0, 43]. Input + * coefficients are assumed to be standard representatives. * - * Arguments: - uint8_t *r: pointer to output byte array with at least - * MLDSA_K* MLDSA_POLYW1_PACKEDBYTES bytes - * - const mld_polyveck *a: pointer to input polynomial vector - **************************************************/ + * @param[out] r Pointer to output byte array with at least + * MLDSA_K * MLDSA_POLYW1_PACKEDBYTES bytes. + * @param[in] w1 Pointer to input polynomial vector. + */ MLD_INTERNAL_API void mld_polyveck_pack_w1(uint8_t r[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES], const mld_polyveck *w1) @@ -488,18 +304,17 @@ __contract__( array_bound(w1->vec[k1].coeffs, 0, MLDSA_N, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))) assigns(memory_slice(r, MLDSA_K * MLDSA_POLYW1_PACKEDBYTES)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) #define mld_polyveck_pack_eta MLD_NAMESPACE_KL(polyveck_pack_eta) -/************************************************* - * Name: mld_polyveck_pack_eta +/** + * Bit-pack polynomial vector with coefficients in [-MLDSA_ETA, MLDSA_ETA]. * - * Description: Bit-pack polynomial vector with coefficients - * in [-MLDSA_ETA,MLDSA_ETA]. - * - * Arguments: - uint8_t *r: pointer to output byte array with - * MLDSA_K * MLDSA_POLYETA_PACKEDBYTES bytes - * - const polyveck *p: pointer to input polynomial vector - **************************************************/ + * @param[out] r Pointer to output byte array with + * MLDSA_K * MLDSA_POLYETA_PACKEDBYTES bytes. + * @param[in] p Pointer to input polynomial vector. 
+ */ MLD_INTERNAL_API void mld_polyveck_pack_eta(uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES], const mld_polyveck *p) @@ -512,16 +327,13 @@ __contract__( ); #define mld_polyvecl_pack_eta MLD_NAMESPACE_KL(polyvecl_pack_eta) -/************************************************* - * Name: mld_polyvecl_pack_eta +/** + * Bit-pack polynomial vector with coefficients in [-MLDSA_ETA, MLDSA_ETA]. * - * Description: Bit-pack polynomial vector with coefficients in - * [-MLDSA_ETA,MLDSA_ETA]. - * - * Arguments: - uint8_t *r: pointer to output byte array with - * MLDSA_L * MLDSA_POLYETA_PACKEDBYTES bytes - * - const polyveck *p: pointer to input polynomial vector - **************************************************/ + * @param[out] r Pointer to output byte array with + * MLDSA_L * MLDSA_POLYETA_PACKEDBYTES bytes. + * @param[in] p Pointer to input polynomial vector. + */ MLD_INTERNAL_API void mld_polyvecl_pack_eta(uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES], const mld_polyvecl *p) @@ -533,39 +345,18 @@ __contract__( assigns(memory_slice(r, MLDSA_L * MLDSA_POLYETA_PACKEDBYTES)) ); -#define mld_polyveck_pack_t0 MLD_NAMESPACE_KL(polyveck_pack_t0) -/************************************************* - * Name: mld_polyveck_pack_t0 - * - * Description: Bit-pack polynomial vector to with coefficients in - * ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}]. 
- * - * Arguments: - uint8_t *r: pointer to output byte array with - * MLDSA_K * MLDSA_POLYT0_PACKEDBYTES bytes - * - const mld_poly *p: pointer to input polynomial vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_pack_t0(uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES], - const mld_polyveck *p) -__contract__( - requires(memory_no_alias(r, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) - requires(memory_no_alias(p, sizeof(mld_polyveck))) - requires(forall(k0, 0, MLDSA_K, - array_bound(p->vec[k0].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))) - assigns(memory_slice(r, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) -); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || \ + (!defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST))) #define mld_polyvecl_unpack_eta MLD_NAMESPACE_KL(polyvecl_unpack_eta) -/************************************************* - * Name: mld_polyvecl_unpack_eta - * - * Description: Unpack polynomial vector with coefficients in - * [-MLDSA_ETA,MLDSA_ETA]. +/** + * Unpack polynomial vector with coefficients in [-MLDSA_ETA, MLDSA_ETA]. * - * Arguments: - mld_polyvecl *p: pointer to output polynomial vector - * - const uint8_t *r: input byte array with - * bit-packed polynomial vector - **************************************************/ + * @param[out] p Pointer to output polynomial vector. + * @param[in] r Input byte array with bit-packed polynomial vector. 
+ */ MLD_INTERNAL_API void mld_polyvecl_unpack_eta( mld_polyvecl *p, const uint8_t r[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES]) @@ -576,18 +367,18 @@ __contract__( ensures(forall(k1, 0, MLDSA_L, array_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || (!MLD_CONFIG_NO_SIGN_API && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST)) */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) #define mld_polyvecl_unpack_z MLD_NAMESPACE_KL(polyvecl_unpack_z) -/************************************************* - * Name: mld_polyvecl_unpack_z - * - * Description: Unpack polynomial vector with coefficients in - * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. +/** + * Unpack polynomial vector with coefficients in + * [-(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1]. * - * Arguments: - mld_polyvecl *z: pointer to output polynomial vector - * - const uint8_t *r: input byte array with - * bit-packed polynomial vector - **************************************************/ + * @param[out] z Pointer to output polynomial vector. + * @param[in] r Input byte array with bit-packed polynomial vector. + */ MLD_INTERNAL_API void mld_polyvecl_unpack_z(mld_polyvecl *z, const uint8_t r[MLDSA_L * MLDSA_POLYZ_PACKEDBYTES]) @@ -598,18 +389,18 @@ __contract__( ensures(forall(k1, 0, MLDSA_L, array_bound(z->vec[k1].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || \ + (!defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST))) #define mld_polyveck_unpack_eta MLD_NAMESPACE_KL(polyveck_unpack_eta) -/************************************************* - * Name: mld_polyveck_unpack_eta - * - * Description: Unpack polynomial vector with coefficients in - * [-MLDSA_ETA,MLDSA_ETA]. +/** + * Unpack polynomial vector with coefficients in [-MLDSA_ETA, MLDSA_ETA]. 
* - * Arguments: - mld_polyveck *p: pointer to output polynomial vector - * - const uint8_t *r: input byte array with - * bit-packed polynomial vector - **************************************************/ + * @param[out] p Pointer to output polynomial vector. + * @param[in] r Input byte array with bit-packed polynomial vector. + */ MLD_INTERNAL_API void mld_polyveck_unpack_eta( mld_polyveck *p, const uint8_t r[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES]) @@ -620,106 +411,8 @@ __contract__( ensures(forall(k1, 0, MLDSA_K, array_bound(p->vec[k1].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || (!MLD_CONFIG_NO_SIGN_API && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST)) */ -#define mld_polyveck_unpack_t0 MLD_NAMESPACE_KL(polyveck_unpack_t0) -/************************************************* - * Name: mld_polyveck_unpack_t0 - * - * Description: Unpack polynomial vector with coefficients in - * ]-2^{MLDSA_D-1}, 2^{MLDSA_D-1}]. - * - * Arguments: - mld_polyveck *p: pointer to output polynomial vector - * - const uint8_t *r: input byte array with - * bit-packed polynomial vector - **************************************************/ -MLD_INTERNAL_API -void mld_polyveck_unpack_t0(mld_polyveck *p, - const uint8_t r[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES]) -__contract__( - requires(memory_no_alias(r, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) - requires(memory_no_alias(p, sizeof(mld_polyveck))) - assigns(memory_slice(p, sizeof(mld_polyveck))) - ensures(forall(k1, 0, MLDSA_K, - array_bound(p->vec[k1].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))) -); - -#define mld_polymat_get_row MLD_NAMESPACE_KL(polymat_get_row) -/************************************************* - * Name: mld_polymat_get_row - * - * Description: Retrieve a pointer to a specific row of the matrix. - * In MLD_CONFIG_REDUCE_RAM mode, generates the row on-demand. 
- * - * Arguments: - mld_polymat *mat: pointer to matrix - * - unsigned int row: row index (must be < MLDSA_K) - * - * Returns pointer to the row (mld_polyvecl) - **************************************************/ -MLD_INTERNAL_API -const mld_polyvecl *mld_polymat_get_row(mld_polymat *mat, unsigned int row); - -#define mld_polyvec_matrix_expand MLD_NAMESPACE_KL(polyvec_matrix_expand) -/************************************************* - * Name: mld_polyvec_matrix_expand - * - * Description: Implementation of ExpandA. Generates matrix A with uniformly - * random coefficients a_{i,j} by performing rejection - * sampling on the output stream of SHAKE128(rho|j|i) - * - * Arguments: - mld_polymat *mat: pointer to output matrix - * - const uint8_t rho[]: byte array containing seed rho - **************************************************/ -MLD_INTERNAL_API -void mld_polyvec_matrix_expand(mld_polymat *mat, - const uint8_t rho[MLDSA_SEEDBYTES]) -__contract__( - requires(memory_no_alias(mat, sizeof(mld_polymat))) - requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) - assigns(memory_slice(mat, sizeof(mld_polymat))) - ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) -); - - - -#define mld_polyvec_matrix_pointwise_montgomery \ - MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery) -/************************************************* - * Name: mld_polyvec_matrix_pointwise_montgomery - * - * Description: Compute matrix-vector multiplication in NTT domain with - * pointwise multiplication and multiplication by 2^{-32}. - * Input matrix and vector must be in NTT domain representation. - * - * The first input "mat" must be the output of - * polyvec_matrix_expand() and so have coefficients in [0, Q-1] - * inclusive. - * - * The second input "v" is assumed to be output of an NTT, and - * hence must have coefficients bounded by [-9q+1, +9q-1] - * inclusive. 
- * - * Note: In MLD_CONFIG_REDUCE_RAM mode, mat cannot be const - * as rows are generated on-demand. - * - * Arguments: - mld_polyveck *t: pointer to output vector t - * - mld_polymat *mat: pointer to input matrix - * - const mld_polyvecl *v: pointer to input vector v - **************************************************/ -MLD_INTERNAL_API -void mld_polyvec_matrix_pointwise_montgomery(mld_polyveck *t, mld_polymat *mat, - const mld_polyvecl *v) -__contract__( - requires(memory_no_alias(t, sizeof(mld_polyveck))) - requires(memory_no_alias(mat, sizeof(mld_polymat))) - requires(memory_no_alias(v, sizeof(mld_polyvecl))) - requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - requires(forall(l1, 0, MLDSA_L, - array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - assigns(memory_slice(t, sizeof(mld_polyveck))) - ensures(forall(k0, 0, MLDSA_K, - array_abs_bound(t->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q))) -); #endif /* !MLD_POLYVEC_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.c b/crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.c new file mode 100644 index 00000000000..6e0971fb619 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS204] + * FIPS 204 Module-Lattice-Based Digital Signature Standard + * National Institute of Standards and Technology + * https://csrc.nist.gov/pubs/fips/204/final + */ + +#include "polyvec_lazy.h" + +#include "debug.h" + +/* This namespacing is not done at the top to avoid a naming conflict + * with native backends, which are currently not yet namespaced. 
*/ +#define mld_polymat_expand_entry MLD_ADD_PARAM_SET(mld_polymat_expand_entry) + +/** + * Sample a single matrix entry A[k][l] of ExpandA(rho) by rejection sampling + * from SHAKE128(rho|l|k), and apply the custom-order permutation when a + * native NTT backend is in use. + * + * The caller is expected to have copied rho into the first MLDSA_SEEDBYTES + * of seed_ext. This function writes the domain-separation bytes + * seed_ext[MLDSA_SEEDBYTES .. MLDSA_SEEDBYTES + 1] = {l, k} before sampling. + * + * @param[out] p Pointer to output polynomial. + * @param[in,out] seed_ext Seed buffer pre-filled with rho in the first + * MLDSA_SEEDBYTES; the two bytes after rho are + * overwritten. + * @param l Column index (inner, aka nonce low byte). + * @param k Row index (outer, aka nonce high byte). + */ +static MLD_INLINE void mld_polymat_expand_entry( + mld_poly *p, uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)], uint8_t l, + uint8_t k) +__contract__( + requires(memory_no_alias(p, sizeof(mld_poly))) + requires(memory_no_alias(seed_ext, MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2))) + assigns(memory_slice(p, sizeof(mld_poly))) + assigns(memory_slice(seed_ext, MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2))) + ensures(array_bound(p->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) +) +{ + seed_ext[MLDSA_SEEDBYTES + 0] = l; + seed_ext[MLDSA_SEEDBYTES + 1] = k; + mld_poly_uniform(p, seed_ext); + mld_poly_permute_bitrev_to_custom_optional(p); +} + +#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) + +MLD_INTERNAL_API +void mld_polyvec_matrix_expand_eager(mld_polymat_eager *mat, + const uint8_t rho[MLDSA_SEEDBYTES]) +{ + unsigned int i, j; + MLD_ALIGN uint8_t seed_ext[4][MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; + + for (j = 0; j < 4; j++) + __loop__( + assigns(j, object_whole(seed_ext)) + invariant(j <= 4) + decreases(4 - j) + ) + { + mld_memcpy(seed_ext[j], rho, MLDSA_SEEDBYTES); + } + +#if !defined(MLD_CONFIG_SERIAL_FIPS202_ONLY) + /* Sample 4 matrix entries at a time.
*/ + for (i = 0; i < (MLDSA_K * MLDSA_L / 4) * 4; i += 4) + __loop__( + assigns(i, j, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat_eager))) + invariant(i <= (MLDSA_K * MLDSA_L / 4) * 4 && i % 4 == 0) + /* vectors 0 .. i / MLDSA_L are completely sampled */ + invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L, + array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + /* last vector is sampled up to i % MLDSA_L */ + invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L, + array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + decreases((MLDSA_K * MLDSA_L / 4) * 4 - i) + ) + { + for (j = 0; j < 4; j++) + __loop__( + assigns(j, object_whole(seed_ext)) + invariant(j <= 4) + decreases(4 - j) + ) + { + uint8_t x = (uint8_t)((i + j) / MLDSA_L); + uint8_t y = (uint8_t)((i + j) % MLDSA_L); + + seed_ext[j][MLDSA_SEEDBYTES + 0] = y; + seed_ext[j][MLDSA_SEEDBYTES + 1] = x; + } + + mld_poly_uniform_4x(&mat->vec[i / MLDSA_L].vec[i % MLDSA_L], + &mat->vec[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L], + &mat->vec[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L], + &mat->vec[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L], + seed_ext); + mld_poly_permute_bitrev_to_custom_optional( + &mat->vec[i / MLDSA_L].vec[i % MLDSA_L]); + mld_poly_permute_bitrev_to_custom_optional( + &mat->vec[(i + 1) / MLDSA_L].vec[(i + 1) % MLDSA_L]); + mld_poly_permute_bitrev_to_custom_optional( + &mat->vec[(i + 2) / MLDSA_L].vec[(i + 2) % MLDSA_L]); + mld_poly_permute_bitrev_to_custom_optional( + &mat->vec[(i + 3) / MLDSA_L].vec[(i + 3) % MLDSA_L]); + } +#else /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */ + i = 0; +#endif /* MLD_CONFIG_SERIAL_FIPS202_ONLY */ + + /* Entries omitted by the batch-sampling are sampled individually. */ + while (i < MLDSA_K * MLDSA_L) + __loop__( + assigns(i, object_whole(seed_ext), memory_slice(mat, sizeof(mld_polymat_eager))) + invariant(i <= MLDSA_K * MLDSA_L) + /* vectors 0 .. 
i / MLDSA_L are completely sampled */ + invariant(forall(k1, 0, i / MLDSA_L, forall(l1, 0, MLDSA_L, + array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + /* last vector is sampled up to i % MLDSA_L */ + invariant(forall(k2, i / MLDSA_L, i / MLDSA_L + 1, forall(l2, 0, i % MLDSA_L, + array_bound(mat->vec[k2].vec[l2].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + decreases(MLDSA_K * MLDSA_L - i) + ) + { + uint8_t x = (uint8_t)(i / MLDSA_L); + uint8_t y = (uint8_t)(i % MLDSA_L); + mld_polymat_expand_entry(&mat->vec[x].vec[y], seed_ext[0], y, x); + i++; + } + + /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ + mld_zeroize(seed_ext, sizeof(seed_ext)); +} + +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_row_eager(mld_poly *t_row, + mld_polymat_eager *mat, + const mld_polyvecl *v, + unsigned int i) +{ + mld_polyvecl_pointwise_acc_montgomery(t_row, &mat->vec[i], v); +} + +#if !defined(MLD_CONFIG_NO_SIGN_API) +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_yvec_eager(mld_polyveck *w, + mld_polymat_eager *mat, + const mld_yvec_eager *y, + mld_polyvecl *scratch) +{ + unsigned int i; + *scratch = y->vec; + mld_polyvecl_ntt(scratch); + + for (i = 0; i < MLDSA_K; ++i) + __loop__( + assigns(i, memory_slice(w, sizeof(mld_polyveck))) + invariant(i <= MLDSA_K) + invariant(forall(k0, 0, i, + array_abs_bound(w->vec[k0].coeffs, 0, MLDSA_N, MLDSA_Q))) + decreases(MLDSA_K - i) + ) + { + mld_polyvec_matrix_pointwise_montgomery_row_eager(&w->vec[i], mat, scratch, + i); + } + + mld_polyveck_invntt_tomont(w); +} +#endif /* !MLD_CONFIG_NO_SIGN_API */ + +#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) + +MLD_INTERNAL_API +void mld_polyvec_matrix_expand_lazy(mld_polymat_lazy *mat, + const uint8_t rho[MLDSA_SEEDBYTES]) +{ + mld_memcpy(mat->rho, rho, MLDSA_SEEDBYTES); +} + +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_VERIFY_API) +MLD_INTERNAL_API 
+void mld_polyvec_matrix_pointwise_montgomery_row_lazy(mld_poly *t_row, + mld_polymat_lazy *mat, + const mld_polyvecl *v, + unsigned int i) +{ + unsigned int l; + MLD_ALIGN uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; + mld_memcpy(seed_ext, mat->rho, MLDSA_SEEDBYTES); + + mld_polymat_expand_entry(t_row, seed_ext, 0, (uint8_t)i); + mld_poly_pointwise_montgomery(t_row, &v->vec[0]); + + for (l = 1; l < MLDSA_L; ++l) + __loop__( + assigns(l, object_whole(seed_ext), + memory_slice(t_row, sizeof(mld_poly)), + memory_slice(mat, sizeof(mld_polymat_lazy))) + invariant(l >= 1 && l <= MLDSA_L) + invariant(array_abs_bound(t_row->coeffs, 0, MLDSA_N, l * MLDSA_Q)) + decreases(MLDSA_L - l) + ) + { + mld_polymat_expand_entry(&mat->cur, seed_ext, (uint8_t)l, (uint8_t)i); + mld_poly_pointwise_montgomery(&mat->cur, &v->vec[l]); + mld_poly_add(t_row, &mat->cur); + } + mld_poly_reduce(t_row); + + /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ + mld_zeroize(seed_ext, sizeof(seed_ext)); +} +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_VERIFY_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_yvec_lazy(mld_polyveck *w, + mld_polymat_lazy *mat, + const mld_yvec_lazy *y, + mld_polyvecl *scratch) +{ + unsigned int k, l; + MLD_ALIGN uint8_t seed_ext[MLD_ALIGN_UP(MLDSA_SEEDBYTES + 2)]; + /* Only the first poly of the polyvecl scratch is used. The polyvecl type + * matches the eager variant for API uniformity; in REDUCE_RAM mode the + * polyvecl storage is provided "for free" by the caller's polyveck/polyvecl + * union. */ + mld_poly *y_ntt = &scratch->vec[0]; + + mld_memcpy(seed_ext, mat->rho, MLDSA_SEEDBYTES); + + /* Column-by-column: sample y[l], NTT, accumulate column l of A into w. 
*/ + for (l = 0; l < MLDSA_L; l++) + __loop__( + assigns(k, l, object_whole(seed_ext), + memory_slice(w, sizeof(mld_polyveck)), + memory_slice(mat, sizeof(mld_polymat_lazy)), + memory_slice(scratch, sizeof(mld_polyvecl))) + invariant(l <= MLDSA_L) + invariant(l == 0 || + forall(k0, 0, MLDSA_K, + array_abs_bound(w->vec[k0].coeffs, 0, MLDSA_N, + (int)l * MLDSA_Q))) + decreases(MLDSA_L - l) + ) + { + mld_yvec_get_poly_lazy(y_ntt, y, l); + mld_poly_ntt(y_ntt); + for (k = 0; k < MLDSA_K; k++) + __loop__( + assigns(k, object_whole(seed_ext), + memory_slice(w, sizeof(mld_polyveck)), + memory_slice(mat, sizeof(mld_polymat_lazy))) + invariant(k <= MLDSA_K) + invariant(l != 0 || + forall(k1, 0, k, + array_abs_bound(w->vec[k1].coeffs, 0, MLDSA_N, MLDSA_Q))) + invariant(l == 0 || + forall(k2, 0, k, + array_abs_bound(w->vec[k2].coeffs, 0, MLDSA_N, + ((int)l + 1) * MLDSA_Q))) + invariant(l == 0 || + forall(k3, k, MLDSA_K, + array_abs_bound(w->vec[k3].coeffs, 0, MLDSA_N, + (int)l * MLDSA_Q))) + decreases(MLDSA_K - k) + ) + { + if (l == 0) + { + mld_polymat_expand_entry(&w->vec[k], seed_ext, 0, (uint8_t)k); + mld_poly_pointwise_montgomery(&w->vec[k], y_ntt); + } + else + { + mld_polymat_expand_entry(&mat->cur, seed_ext, (uint8_t)l, (uint8_t)k); + mld_poly_pointwise_montgomery(&mat->cur, y_ntt); + mld_poly_add(&w->vec[k], &mat->cur); + } + } + } + + /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ + mld_zeroize(seed_ext, sizeof(seed_ext)); + mld_polyveck_reduce(w); + mld_polyveck_invntt_tomont(w); +} +#endif /* !MLD_CONFIG_NO_SIGN_API */ + +#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef mld_polymat_expand_entry diff --git a/crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.h b/crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.h new file mode 100644 index 00000000000..8149220bc66 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/polyvec_lazy.h @@ -0,0 +1,653 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS204] + * FIPS 204 Module-Lattice-Based Digital Signature Standard + * National Institute of Standards and Technology + * https://csrc.nist.gov/pubs/fips/204/final + */ + +/* + * Eager and lazy variants of polynomial vector types. + * + * In eager mode, full vectors are precomputed and stored in memory. + * In lazy mode, data is stored in packed form and expanded on demand, + * trading computation for reduced memory usage. + * + * MLD_CONFIG_REDUCE_RAM selects which variant is used. + */ + +#ifndef MLD_POLYVEC_LAZY_H +#define MLD_POLYVEC_LAZY_H + +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_SIGN_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API) + +#include "poly.h" +#include "poly_kl.h" +#include "polyvec.h" + +/* Parameter set namespacing */ +#define mld_sk_s1hat_eager MLD_ADD_PARAM_SET(mld_sk_s1hat_eager) +#define mld_sk_s1hat_lazy MLD_ADD_PARAM_SET(mld_sk_s1hat_lazy) +#define mld_sk_s1hat MLD_ADD_PARAM_SET(mld_sk_s1hat) +#define mld_unpack_sk_s1hat_eager MLD_ADD_PARAM_SET(mld_unpack_sk_s1hat_eager) +#define mld_unpack_sk_s1hat_lazy MLD_ADD_PARAM_SET(mld_unpack_sk_s1hat_lazy) +#define mld_sk_s1hat_get_poly_eager \ + MLD_ADD_PARAM_SET(mld_sk_s1hat_get_poly_eager) +#define mld_sk_s1hat_get_poly_lazy MLD_ADD_PARAM_SET(mld_sk_s1hat_get_poly_lazy) +#define mld_sk_s2hat_eager MLD_ADD_PARAM_SET(mld_sk_s2hat_eager) +#define mld_sk_s2hat_lazy MLD_ADD_PARAM_SET(mld_sk_s2hat_lazy) +#define mld_sk_s2hat MLD_ADD_PARAM_SET(mld_sk_s2hat) +#define mld_unpack_sk_s2hat_eager MLD_ADD_PARAM_SET(mld_unpack_sk_s2hat_eager) +#define 
mld_unpack_sk_s2hat_lazy MLD_ADD_PARAM_SET(mld_unpack_sk_s2hat_lazy) +#define mld_sk_s2hat_get_poly_eager \ + MLD_ADD_PARAM_SET(mld_sk_s2hat_get_poly_eager) +#define mld_sk_s2hat_get_poly_lazy MLD_ADD_PARAM_SET(mld_sk_s2hat_get_poly_lazy) +#define mld_sk_t0hat_eager MLD_ADD_PARAM_SET(mld_sk_t0hat_eager) +#define mld_sk_t0hat_lazy MLD_ADD_PARAM_SET(mld_sk_t0hat_lazy) +#define mld_sk_t0hat MLD_ADD_PARAM_SET(mld_sk_t0hat) +#define mld_unpack_sk_t0hat_eager MLD_ADD_PARAM_SET(mld_unpack_sk_t0hat_eager) +#define mld_unpack_sk_t0hat_lazy MLD_ADD_PARAM_SET(mld_unpack_sk_t0hat_lazy) +#define mld_sk_t0hat_get_poly_eager \ + MLD_ADD_PARAM_SET(mld_sk_t0hat_get_poly_eager) +#define mld_sk_t0hat_get_poly_lazy MLD_ADD_PARAM_SET(mld_sk_t0hat_get_poly_lazy) +#define mld_polymat MLD_ADD_PARAM_SET(mld_polymat) +#define mld_polymat_eager MLD_ADD_PARAM_SET(mld_polymat_eager) +#define mld_polymat_lazy MLD_ADD_PARAM_SET(mld_polymat_lazy) +#define mld_poly_permute_bitrev_to_custom_optional \ + MLD_ADD_PARAM_SET(mld_poly_permute_bitrev_to_custom_optional) +#define mld_polyvec_matrix_expand_eager \ + MLD_NAMESPACE_KL(polyvec_matrix_expand_eager) +#define mld_polyvec_matrix_expand_lazy \ + MLD_NAMESPACE_KL(polyvec_matrix_expand_lazy) +#define mld_polyvec_matrix_pointwise_montgomery \ + MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery) +#define mld_polyvec_matrix_pointwise_montgomery_row_eager \ + MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery_row_eager) +#define mld_polyvec_matrix_pointwise_montgomery_row_lazy \ + MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery_row_lazy) +#define mld_polyvec_matrix_pointwise_montgomery_yvec_eager \ + MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery_yvec_eager) +#define mld_polyvec_matrix_pointwise_montgomery_yvec_lazy \ + MLD_NAMESPACE_KL(polyvec_matrix_pointwise_montgomery_yvec_lazy) +#define mld_yvec_eager MLD_ADD_PARAM_SET(mld_yvec_eager) +#define mld_yvec_lazy MLD_ADD_PARAM_SET(mld_yvec_lazy) +#define mld_yvec 
MLD_ADD_PARAM_SET(mld_yvec) +#define mld_yvec_init_eager MLD_ADD_PARAM_SET(mld_yvec_init_eager) +#define mld_yvec_init_lazy MLD_ADD_PARAM_SET(mld_yvec_init_lazy) +#define mld_yvec_get_poly_eager MLD_ADD_PARAM_SET(mld_yvec_get_poly_eager) +#define mld_yvec_get_poly_lazy MLD_ADD_PARAM_SET(mld_yvec_get_poly_lazy) +/* End of parameter set namespacing */ + +/** Eager s1hat: precomputed s1 vector in NTT domain. */ +typedef struct +{ + mld_polyvecl vec; /**< s1 vector in NTT domain. */ +} mld_sk_s1hat_eager; + +/** Eager s2hat: precomputed s2 vector in NTT domain. */ +typedef struct +{ + mld_polyveck vec; /**< s2 vector in NTT domain. */ +} mld_sk_s2hat_eager; + +/** Eager t0hat: precomputed t0 vector in NTT domain. */ +typedef struct +{ + mld_polyveck vec; /**< t0 vector in NTT domain. */ +} mld_sk_t0hat_eager; + +/** Lazy s1hat: borrow packed s1, unpack and convert to NTT domain on demand. */ +typedef struct +{ + const uint8_t *packed; /**< Pointer to packed s1 in the secret key. */ +} mld_sk_s1hat_lazy; + +/** Lazy s2hat: borrow packed s2, unpack and convert to NTT domain on demand. */ +typedef struct +{ + const uint8_t *packed; /**< Pointer to packed s2 in the secret key. */ +} mld_sk_s2hat_lazy; + +/** Lazy t0hat: borrow packed t0, unpack and convert to NTT domain on demand. */ +typedef struct +{ + const uint8_t *packed; /**< Pointer to packed t0 in the secret key. */ +} mld_sk_t0hat_lazy; + +/** Eager yvec: precomputed and stored full signing masking vector y. */ +typedef struct +{ + mld_polyvecl vec; /**< Masking vector y. */ +} mld_yvec_eager; + +/** Lazy yvec: store seed and nonce, regenerate y[i] on demand. */ +typedef struct +{ + const uint8_t *rhoprime; /**< Pointer to seed used to derive y. */ + uint16_t nonce; /**< Base nonce; component i uses MLDSA_L*nonce + i. 
*/ +} mld_yvec_lazy; + +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_SIGN_API) +/* s1vec */ + +#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +static MLD_INLINE void mld_unpack_sk_s1hat_eager( + mld_sk_s1hat_eager *s1, + const uint8_t packed_s1[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES]) +__contract__( + requires(memory_no_alias(s1, sizeof(mld_sk_s1hat_eager))) + requires(memory_no_alias(packed_s1, MLDSA_L * MLDSA_POLYETA_PACKEDBYTES)) + assigns(memory_slice(s1, sizeof(mld_sk_s1hat_eager))) + ensures(forall(k1, 0, MLDSA_L, + array_abs_bound(s1->vec.vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) +) +{ + mld_polyvecl_unpack_eta(&s1->vec, packed_s1); + mld_polyvecl_ntt(&s1->vec); +} + +#if !defined(MLD_CONFIG_NO_SIGN_API) +static MLD_INLINE void mld_sk_s1hat_get_poly_eager(mld_poly *buf, + const mld_sk_s1hat_eager *s1, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(s1, sizeof(mld_sk_s1hat_eager))) + requires(i < MLDSA_L) + requires(array_abs_bound(s1->vec.vec[i].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_abs_bound(buf->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) +) { *buf = s1->vec.vec[i]; } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +static MLD_INLINE void mld_unpack_sk_s1hat_lazy( + mld_sk_s1hat_lazy *s1, + const uint8_t packed_s1[MLDSA_L * MLDSA_POLYETA_PACKEDBYTES]) +__contract__( + requires(memory_no_alias(s1, sizeof(mld_sk_s1hat_lazy))) + assigns(memory_slice(s1, sizeof(mld_sk_s1hat_lazy))) + ensures(s1->packed == old(packed_s1)) +) { s1->packed = packed_s1; } + +#if !defined(MLD_CONFIG_NO_SIGN_API) +static MLD_INLINE void mld_sk_s1hat_get_poly_lazy(mld_poly *buf, + const mld_sk_s1hat_lazy *s1, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(s1, 
sizeof(mld_sk_s1hat_lazy))) + requires(i < MLDSA_L) + requires(memory_no_alias(s1->packed, MLDSA_L * MLDSA_POLYETA_PACKEDBYTES)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_abs_bound(buf->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) +) +{ + mld_polyeta_unpack(buf, s1->packed + i * MLDSA_POLYETA_PACKEDBYTES); + mld_poly_ntt(buf); +} +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +/* s2vec */ + +#if (!defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST)) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) +static MLD_INLINE void mld_unpack_sk_s2hat_eager( + mld_sk_s2hat_eager *s2, + const uint8_t packed_s2[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES]) +__contract__( + requires(memory_no_alias(s2, sizeof(mld_sk_s2hat_eager))) + requires(memory_no_alias(packed_s2, MLDSA_K * MLDSA_POLYETA_PACKEDBYTES)) + assigns(memory_slice(s2, sizeof(mld_sk_s2hat_eager))) + ensures(forall(k1, 0, MLDSA_K, + array_abs_bound(s2->vec.vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) +) +{ + mld_polyveck_unpack_eta(&s2->vec, packed_s2); + mld_polyveck_ntt(&s2->vec); +} + +#if !defined(MLD_CONFIG_NO_SIGN_API) +static MLD_INLINE void mld_sk_s2hat_get_poly_eager(mld_poly *buf, + const mld_sk_s2hat_eager *s2, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(s2, sizeof(mld_sk_s2hat_eager))) + requires(i < MLDSA_K) + requires(array_abs_bound(s2->vec.vec[i].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_abs_bound(buf->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) +) { *buf = s2->vec.vec[i]; } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* (!MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST) && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) */ +#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +static MLD_INLINE void mld_unpack_sk_s2hat_lazy( + mld_sk_s2hat_lazy *s2, + const uint8_t packed_s2[MLDSA_K * MLDSA_POLYETA_PACKEDBYTES]) 
+__contract__( + requires(memory_no_alias(s2, sizeof(mld_sk_s2hat_lazy))) + assigns(memory_slice(s2, sizeof(mld_sk_s2hat_lazy))) + ensures(s2->packed == old(packed_s2)) +) { s2->packed = packed_s2; } + +#if !defined(MLD_CONFIG_NO_SIGN_API) +static MLD_INLINE void mld_sk_s2hat_get_poly_lazy(mld_poly *buf, + const mld_sk_s2hat_lazy *s2, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(s2, sizeof(mld_sk_s2hat_lazy))) + requires(i < MLDSA_K) + requires(memory_no_alias(s2->packed, MLDSA_K * MLDSA_POLYETA_PACKEDBYTES)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_abs_bound(buf->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) +) +{ + mld_polyeta_unpack(buf, s2->packed + i * MLDSA_POLYETA_PACKEDBYTES); + mld_poly_ntt(buf); +} +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +/* t0vec */ + +#if (!defined(MLD_CONFIG_NO_SIGN_API) || defined(MLD_UNIT_TEST)) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) +static MLD_INLINE void mld_unpack_sk_t0hat_eager( + mld_sk_t0hat_eager *t0, + const uint8_t packed_t0[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES]) +__contract__( + requires(memory_no_alias(t0, sizeof(mld_sk_t0hat_eager))) + requires(memory_no_alias(packed_t0, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) + assigns(memory_slice(t0, sizeof(mld_sk_t0hat_eager))) + ensures(forall(k1, 0, MLDSA_K, + array_abs_bound(t0->vec.vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) +) +{ + unsigned int i; + for (i = 0; i < MLDSA_K; ++i) + __loop__( + assigns(i, memory_slice(t0, sizeof(mld_sk_t0hat_eager))) + invariant(i <= MLDSA_K) + invariant(forall(k0, 0, i, + array_bound(t0->vec.vec[k0].coeffs, 0, MLDSA_N, + -(1 << (MLDSA_D - 1)) + 1, (1 << (MLDSA_D - 1)) + 1))) + decreases(MLDSA_K - i) + ) + { + mld_polyt0_unpack(&t0->vec.vec[i], + packed_t0 + i * MLDSA_POLYT0_PACKEDBYTES); + } + mld_polyveck_ntt(&t0->vec); +} + +#if !defined(MLD_CONFIG_NO_SIGN_API) +static MLD_INLINE void 
mld_sk_t0hat_get_poly_eager(mld_poly *buf, + const mld_sk_t0hat_eager *t0, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(t0, sizeof(mld_sk_t0hat_eager))) + requires(i < MLDSA_K) + requires(array_abs_bound(t0->vec.vec[i].coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_abs_bound(buf->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) +) { *buf = t0->vec.vec[i]; } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* (!MLD_CONFIG_NO_SIGN_API || MLD_UNIT_TEST) && \ + (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) */ +#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +static MLD_INLINE void mld_unpack_sk_t0hat_lazy( + mld_sk_t0hat_lazy *t0, + const uint8_t packed_t0[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES]) +__contract__( + requires(memory_no_alias(t0, sizeof(mld_sk_t0hat_lazy))) + assigns(memory_slice(t0, sizeof(mld_sk_t0hat_lazy))) + ensures(t0->packed == old(packed_t0)) +) { t0->packed = packed_t0; } + +#if !defined(MLD_CONFIG_NO_SIGN_API) +static MLD_INLINE void mld_sk_t0hat_get_poly_lazy(mld_poly *buf, + const mld_sk_t0hat_lazy *t0, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(t0, sizeof(mld_sk_t0hat_lazy))) + requires(i < MLDSA_K) + requires(memory_no_alias(t0->packed, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_abs_bound(buf->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) +) +{ + mld_polyt0_unpack(buf, t0->packed + i * MLDSA_POLYT0_PACKEDBYTES); + mld_poly_ntt(buf); +} +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_SIGN_API */ + +/* yvec */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) && \ + (!defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) +static MLD_INLINE void mld_yvec_init_eager( + mld_yvec_eager *y, const uint8_t rhoprime[MLDSA_CRHBYTES], uint16_t 
nonce) +__contract__( + requires(memory_no_alias(y, sizeof(mld_yvec_eager))) + requires(memory_no_alias(rhoprime, MLDSA_CRHBYTES)) + requires(nonce <= (UINT16_MAX - MLDSA_L) / MLDSA_L) + assigns(memory_slice(y, sizeof(mld_yvec_eager))) + ensures(forall(k1, 0, MLDSA_L, + array_bound(y->vec.vec[k1].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) +) +{ + mld_polyvecl_uniform_gamma1(&y->vec, rhoprime, nonce); +} + +static MLD_INLINE void mld_yvec_get_poly_eager(mld_poly *buf, + const mld_yvec_eager *y, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(y, sizeof(mld_yvec_eager))) + requires(i < MLDSA_L) + requires(array_bound(y->vec.vec[i].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_bound(buf->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1)) +) { *buf = y->vec.vec[i]; } +#endif /* !MLD_CONFIG_NO_SIGN_API && (!MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) \ + */ +#if !defined(MLD_CONFIG_NO_SIGN_API) && \ + (defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST)) +static MLD_INLINE void mld_yvec_init_lazy( + mld_yvec_lazy *y, const uint8_t rhoprime[MLDSA_CRHBYTES], uint16_t nonce) +__contract__( + requires(memory_no_alias(y, sizeof(mld_yvec_lazy))) + assigns(memory_slice(y, sizeof(mld_yvec_lazy))) + ensures(y->rhoprime == old(rhoprime)) + ensures(y->nonce == old(nonce)) +) +{ + y->rhoprime = rhoprime; + y->nonce = nonce; +} + +static MLD_INLINE void mld_yvec_get_poly_lazy(mld_poly *buf, + const mld_yvec_lazy *y, + unsigned int i) +__contract__( + requires(memory_no_alias(buf, sizeof(mld_poly))) + requires(memory_no_alias(y, sizeof(mld_yvec_lazy))) + requires(i < MLDSA_L) + requires(memory_no_alias(y->rhoprime, MLDSA_CRHBYTES)) + requires(y->nonce <= ((UINT16_MAX - MLDSA_L) / MLDSA_L)) + assigns(memory_slice(buf, sizeof(mld_poly))) + ensures(array_bound(buf->coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 
1)) +) +{ + /* Safety: y->nonce is at most ((UINT16_MAX - MLDSA_L) / MLDSA_L) and + * i < MLDSA_L, so MLDSA_L * y->nonce + i fits in uint16_t. See + * MLD_NONCE_UB comment in sign.c. */ + mld_poly_uniform_gamma1(buf, y->rhoprime, + (uint16_t)(MLDSA_L * y->nonce + (int)i)); +} +#endif /* !MLD_CONFIG_NO_SIGN_API && (MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST) \ + */ + +/* polymat */ + +#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +/** Eager polymat: precomputed and stored full MLDSA_K x MLDSA_L matrix. */ +typedef struct +{ + mld_polyvecl vec[MLDSA_K]; /**< Rows of the matrix. */ +} mld_polymat_eager; +#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +/** Lazy polymat: store seed, sample elements A[k][l] on demand. */ +typedef struct +{ + mld_poly cur; /**< On-demand sampled matrix element A[k][l]. */ + uint8_t rho[MLDSA_SEEDBYTES]; /**< Public seed used to expand A. */ +} mld_polymat_lazy; + +static MLD_INLINE void mld_poly_permute_bitrev_to_custom_optional(mld_poly *p) +__contract__( + /* We don't specify that this is a permutation, only that it preserves + * the bounds. + * When the native NTT backend does not use the custom order, this is a no-op. */ + requires(memory_no_alias(p, sizeof(mld_poly))) + requires(array_bound(p->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) + assigns(memory_slice(p, sizeof(mld_poly))) + ensures(array_bound(p->coeffs, 0, MLDSA_N, 0, MLDSA_Q)) +) +{ +#if defined(MLD_USE_NATIVE_NTT_CUSTOM_ORDER) + mld_poly_permute_bitrev_to_custom(p->coeffs); +#else + (void)p; +#endif +} + +#if !defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +/** + * Implementation of ExpandA. Generates matrix A with uniformly random + * coefficients a_{i,j} by performing rejection sampling on the output stream + * of SHAKE128(rho|j|i). + * + * @param[out] mat Pointer to output matrix. + * @param[in] rho Byte array containing seed rho. 
+ */ +MLD_INTERNAL_API +void mld_polyvec_matrix_expand_eager(mld_polymat_eager *mat, + const uint8_t rho[MLDSA_SEEDBYTES]) +__contract__( + requires(memory_no_alias(mat, sizeof(mld_polymat_eager))) + requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) + assigns(memory_slice(mat, sizeof(mld_polymat_eager))) + ensures(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, + array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) +); + +/** + * Compute row i of matrix-vector multiplication in NTT domain with pointwise + * multiplication and multiplication by 2^{-32}. + * + * Input matrix and vector must be in NTT domain representation. Output + * coefficients are bounded by MLDSA_Q in absolute value. + * + * @param[out] t_row Pointer to output row polynomial. + * @param[in] mat Pointer to input matrix. + * @param[in] v Pointer to input vector v. + * @param i Row index, 0 <= i < MLDSA_K. + */ +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_row_eager(mld_poly *t_row, + mld_polymat_eager *mat, + const mld_polyvecl *v, + unsigned int i) +__contract__( + requires(memory_no_alias(t_row, sizeof(mld_poly))) + requires(memory_no_alias(mat, sizeof(mld_polymat_eager))) + requires(memory_no_alias(v, sizeof(mld_polyvecl))) + requires(i < MLDSA_K) + requires(forall(l1, 0, MLDSA_L, + array_bound(mat->vec[i].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q))) + requires(forall(l2, 0, MLDSA_L, + array_abs_bound(v->vec[l2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + assigns(memory_slice(t_row, sizeof(mld_poly))) + ensures(array_abs_bound(t_row->coeffs, 0, MLDSA_N, MLDSA_Q)) +); + +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * Compute w = invNTT(A * NTT(y)) for the signing y vector. + * + * The eager variant copies y into the scratch polyvecl, NTTs it in place, + * calls the standard matrix-vector multiply, and finally inverse-NTTs the + * result into w. + * + * @param[out] w Pointer to output vector. + * @param[in] mat Pointer to input matrix. 
+ * @param[in] y Pointer to (non-NTT) y vector. + * @param[out] scratch Scratch polyvecl for NTT'd copy of y. + */ +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_yvec_eager(mld_polyveck *w, + mld_polymat_eager *mat, + const mld_yvec_eager *y, + mld_polyvecl *scratch) +__contract__( + requires(memory_no_alias(w, sizeof(mld_polyveck))) + requires(memory_no_alias(mat, sizeof(mld_polymat_eager))) + requires(memory_no_alias(y, sizeof(mld_yvec_eager))) + requires(memory_no_alias(scratch, sizeof(mld_polyvecl))) + requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, + array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + requires(forall(l2, 0, MLDSA_L, + array_bound(y->vec.vec[l2].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) + assigns(memory_slice(w, sizeof(mld_polyveck))) + assigns(memory_slice(scratch, sizeof(mld_polyvecl))) + ensures(forall(k0, 0, MLDSA_K, + array_abs_bound(w->vec[k0].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))) +); +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* !MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +#if defined(MLD_CONFIG_REDUCE_RAM) || defined(MLD_UNIT_TEST) +MLD_INTERNAL_API +void mld_polyvec_matrix_expand_lazy(mld_polymat_lazy *mat, + const uint8_t rho[MLDSA_SEEDBYTES]) +__contract__( + requires(memory_no_alias(mat, sizeof(mld_polymat_lazy))) + requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) + assigns(memory_slice(mat, sizeof(mld_polymat_lazy))) +); + +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) || !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * Compute row i of matrix-vector multiplication in NTT domain with pointwise + * multiplication and multiplication by 2^{-32}. + * + * Input vector must be in NTT domain representation; the matrix entries are + * sampled on demand from the seed stored in mat->rho, using mat->cur as + * scratch. Output coefficients are bounded by MLDSA_Q in absolute value. + * + * @param[out] t_row Pointer to output row polynomial. 
+ * @param[in,out] mat Pointer to input matrix (seed + scratch). + * @param[in] v Pointer to input vector v. + * @param i Row index, 0 <= i < MLDSA_K. + */ +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_row_lazy(mld_poly *t_row, + mld_polymat_lazy *mat, + const mld_polyvecl *v, + unsigned int i) +__contract__( + requires(memory_no_alias(t_row, sizeof(mld_poly))) + requires(memory_no_alias(mat, sizeof(mld_polymat_lazy))) + requires(memory_no_alias(v, sizeof(mld_polyvecl))) + requires(i < MLDSA_K) + requires(forall(l1, 0, MLDSA_L, + array_abs_bound(v->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + assigns(memory_slice(t_row, sizeof(mld_poly))) + assigns(memory_slice(mat, sizeof(mld_polymat_lazy))) + ensures(array_abs_bound(t_row->coeffs, 0, MLDSA_N, MLDSA_Q)) +); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_VERIFY_API */ + +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * Compute w = invNTT(A * NTT(y)) for the signing y vector. + * + * The lazy variant samples one column of y at a time, NTTs it into + * &scratch->vec[0], and accumulates the matrix-vector product + * column-by-column with on-demand sampling of A[k][l]. Only the first poly of + * the polyvecl scratch is used; the polyvecl type is shared with the eager + * variant for API uniformity (the storage is provided "for free" by the + * caller's polyveck/polyvecl union in REDUCE_RAM mode). + * + * @param[out] w Pointer to output vector. + * @param[in,out] mat Pointer to input matrix. + * @param[in] y Pointer to y seed/nonce. + * @param[out] scratch Scratch (only &scratch->vec[0] used). 
+ */ +MLD_INTERNAL_API +void mld_polyvec_matrix_pointwise_montgomery_yvec_lazy(mld_polyveck *w, + mld_polymat_lazy *mat, + const mld_yvec_lazy *y, + mld_polyvecl *scratch) +__contract__( + requires(memory_no_alias(w, sizeof(mld_polyveck))) + requires(memory_no_alias(mat, sizeof(mld_polymat_lazy))) + requires(memory_no_alias(y, sizeof(mld_yvec_lazy))) + requires(memory_no_alias(scratch, sizeof(mld_polyvecl))) + requires(memory_no_alias(y->rhoprime, MLDSA_CRHBYTES)) + requires(y->nonce <= ((UINT16_MAX - MLDSA_L) / MLDSA_L)) + assigns(memory_slice(w, sizeof(mld_polyveck))) + assigns(memory_slice(mat, sizeof(mld_polymat_lazy))) + assigns(memory_slice(scratch, sizeof(mld_polyvecl))) + ensures(forall(k0, 0, MLDSA_K, + array_abs_bound(w->vec[k0].coeffs, 0, MLDSA_N, MLD_INTT_BOUND))) +); +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#endif /* MLD_CONFIG_REDUCE_RAM || MLD_UNIT_TEST */ + +/* Dispatch: typedef and define based on MLD_CONFIG_REDUCE_RAM */ +#if defined(MLD_CONFIG_REDUCE_RAM) +typedef mld_sk_s1hat_lazy mld_sk_s1hat; +typedef mld_sk_s2hat_lazy mld_sk_s2hat; +typedef mld_sk_t0hat_lazy mld_sk_t0hat; +typedef mld_polymat_lazy mld_polymat; +typedef mld_yvec_lazy mld_yvec; +#define mld_unpack_sk_s1hat mld_unpack_sk_s1hat_lazy +#define mld_unpack_sk_s2hat mld_unpack_sk_s2hat_lazy +#define mld_unpack_sk_t0hat mld_unpack_sk_t0hat_lazy +#if !defined(MLD_CONFIG_NO_SIGN_API) +#define mld_sk_s1hat_get_poly mld_sk_s1hat_get_poly_lazy +#define mld_sk_s2hat_get_poly mld_sk_s2hat_get_poly_lazy +#define mld_sk_t0hat_get_poly mld_sk_t0hat_get_poly_lazy +#endif +#define mld_polyvec_matrix_expand mld_polyvec_matrix_expand_lazy +#define mld_polyvec_matrix_pointwise_montgomery_row \ + mld_polyvec_matrix_pointwise_montgomery_row_lazy +#define mld_yvec_init mld_yvec_init_lazy +#define mld_yvec_get_poly mld_yvec_get_poly_lazy +#define mld_polyvec_matrix_pointwise_montgomery_yvec \ + mld_polyvec_matrix_pointwise_montgomery_yvec_lazy +#else /* MLD_CONFIG_REDUCE_RAM */ +typedef mld_sk_s1hat_eager 
mld_sk_s1hat; +typedef mld_sk_s2hat_eager mld_sk_s2hat; +typedef mld_sk_t0hat_eager mld_sk_t0hat; +typedef mld_polymat_eager mld_polymat; +typedef mld_yvec_eager mld_yvec; +#define mld_unpack_sk_s1hat mld_unpack_sk_s1hat_eager +#define mld_unpack_sk_s2hat mld_unpack_sk_s2hat_eager +#define mld_unpack_sk_t0hat mld_unpack_sk_t0hat_eager +#if !defined(MLD_CONFIG_NO_SIGN_API) +#define mld_sk_s2hat_get_poly mld_sk_s2hat_get_poly_eager +#define mld_sk_s1hat_get_poly mld_sk_s1hat_get_poly_eager +#define mld_sk_t0hat_get_poly mld_sk_t0hat_get_poly_eager +#endif +#define mld_polyvec_matrix_expand mld_polyvec_matrix_expand_eager +#define mld_polyvec_matrix_pointwise_montgomery_row \ + mld_polyvec_matrix_pointwise_montgomery_row_eager +#define mld_yvec_init mld_yvec_init_eager +#define mld_yvec_get_poly mld_yvec_get_poly_eager +#define mld_polyvec_matrix_pointwise_montgomery_yvec \ + mld_polyvec_matrix_pointwise_montgomery_yvec_eager +#endif /* !MLD_CONFIG_REDUCE_RAM */ + +#endif /* !MLD_CONFIG_NO_KEYPAIR_API || !MLD_CONFIG_NO_SIGN_API || \ + !MLD_CONFIG_NO_VERIFY_API */ +#endif /* !MLD_POLYVEC_LAZY_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h b/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h index da557724190..ee34dc9b487 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/randombytes.h @@ -6,7 +6,6 @@ #define MLD_RANDOMBYTES_H #include -#include #include "cbmc.h" #include "common.h" @@ -15,6 +14,8 @@ #if !defined(MLD_CONFIG_CUSTOM_RANDOMBYTES) MLD_MUST_CHECK_RETURN_VALUE int randombytes(uint8_t *out, size_t outlen); + +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_randombytes(uint8_t *out, size_t outlen) __contract__( requires(memory_no_alias(out, outlen)) diff --git a/crypto/fipsmodule/ml_dsa/mldsa/reduce.h b/crypto/fipsmodule/ml_dsa/mldsa/reduce.h index 413cf1829ef..bb86d07056d 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/reduce.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/reduce.h @@ -5,7 +5,6 @@ #ifndef 
MLD_REDUCE_H #define MLD_REDUCE_H -#include #include "cbmc.h" #include "common.h" #include "ct.h" @@ -21,21 +20,19 @@ /* check-magic: 6283009 == (MLD_REDUCE32_DOMAIN_MAX - 255 * MLDSA_Q + 1) */ #define MLD_REDUCE32_RANGE_MAX 6283009 -/************************************************* - * Name: mld_montgomery_reduce +/** + * Generic Montgomery reduction; given a 64-bit integer a, computes a 32-bit + * integer congruent to a * R^-1 mod MLDSA_Q, where R=2^32. * - * Description: Generic Montgomery reduction; given a 64-bit integer a, computes - * 32-bit integer congruent to a * R^-1 mod q, where R=2^32 + * @param a Input integer to be reduced, of absolute value smaller or equal + * to INT64_MAX - 2^31 * MLDSA_Q. * - * Arguments: - int64_t a: input integer to be reduced, of absolute value - * smaller or equal to INT64_MAX - 2^31 * MLDSA_Q. - * - * Returns: Integer congruent to a * R^-1 modulo q, with absolute value - * <= |a| / 2^32 + MLDSA_Q / 2 - * - * In particular, if |a| < 2^31 * MLDSA_Q, the absolute value - * of the return value is < MLDSA_Q. - **************************************************/ + * @return Integer congruent to a * R^-1 modulo MLDSA_Q, with absolute value + * <= |a| / 2^32 + MLDSA_Q / 2. + * In particular, if |a| < 2^31 * MLDSA_Q, the absolute value of the + * return value is < MLDSA_Q. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int32_t mld_montgomery_reduce(int64_t a) __contract__( /* We don't attempt to express an input-dependent output bound @@ -86,17 +83,16 @@ __contract__( return (int32_t)r; } -/************************************************* - * Name: mld_reduce32 - * - * Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, - * compute r \equiv a (mod MLDSA_Q) such that - * -MLD_REDUCE32_RANGE_MAX <= r < MLD_REDUCE32_RANGE_MAX. +/** + * For finite field element a with a <= 2^{31} - 2^{22} - 1, compute + * r congruent to a (mod MLDSA_Q) such that + * -MLD_REDUCE32_RANGE_MAX <= r < MLD_REDUCE32_RANGE_MAX. 
* - * Arguments: - int32_t: finite field element a + * @param a Finite field element. * - * Returns r. - **************************************************/ + * @return r. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int32_t mld_reduce32(int32_t a) __contract__( requires(a <= MLD_REDUCE32_DOMAIN_MAX) @@ -112,22 +108,21 @@ __contract__( return t; } -/************************************************* - * Name: mld_caddq - * - * Description: Add MLDSA_Q if input coefficient is negative. +/** + * Add MLDSA_Q if input coefficient is negative. * - * Arguments: - int32_t: finite field element a + * @param a Finite field element. * - * Returns r. - **************************************************/ + * @return r. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int32_t mld_caddq(int32_t a) __contract__( requires(a > -MLDSA_Q) requires(a < MLDSA_Q) ensures(return_value >= 0) ensures(return_value < MLDSA_Q) - ensures(return_value == (a >= 0) ? a : (a + MLDSA_Q)) + ensures(return_value == ((a >= 0) ? a : (a + MLDSA_Q))) ) { return mld_ct_sel_int32(a + MLDSA_Q, a, mld_ct_cmask_neg_i32(a)); diff --git a/crypto/fipsmodule/ml_dsa/mldsa/rounding.h b/crypto/fipsmodule/ml_dsa/mldsa/rounding.h index a83562b0f1e..21fae3c989e 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/rounding.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/rounding.h @@ -15,7 +15,6 @@ #ifndef MLD_ROUNDING_H #define MLD_ROUNDING_H -#include #include "cbmc.h" #include "common.h" #include "ct.h" @@ -33,20 +32,19 @@ #define MLD_2_POW_D (1 << MLDSA_D) -/************************************************* - * Name: mld_power2round +/** + * For finite field element a, compute a0, a1 such that + * a mod^+ MLDSA_Q = a1*2^MLDSA_D + a0 with + * -2^{MLDSA_D-1} < a0 <= 2^{MLDSA_D-1}. Assumes a to be standard + * representative. * - * Description: For finite field element a, compute a0, a1 such that - * a mod^+ MLDSA_Q = a1*2^MLDSA_D + a0 with -2^{MLDSA_D-1} < a0 <= - * 2^{MLDSA_D-1}. Assumes a to be standard representative. 
+ * @reference{In the reference implementation, a1 is passed as a return value + * instead.} * - * Arguments: - int32_t a: input element - * - int32_t *a0: pointer to output element a0 - * - int32_t *a1: pointer to output element a1 - * - * Reference: In the reference implementation, a1 is passed as a - * return value instead. - **************************************************/ + * @param[out] a0 Pointer to output element a0. + * @param[out] a1 Pointer to output element a1. + * @param a Input element. + */ static MLD_INLINE void mld_power2round(int32_t *a0, int32_t *a1, int32_t a) __contract__( requires(memory_no_alias(a0, sizeof(int32_t))) @@ -63,22 +61,21 @@ __contract__( *a0 = a - (*a1 << MLDSA_D); } -/************************************************* - * Name: mld_decompose - * - * Description: For finite field element a, compute high and low bits a0, a1 - * such that a mod^+ MLDSA_Q = a1* 2 * MLDSA_GAMMA2 + a0 with - * -MLDSA_GAMMA2 < a0 <= MLDSA_GAMMA2 except - * if a1 = (MLDSA_Q-1)/(MLDSA_GAMMA2*2) where we set a1 = 0 and - * -MLDSA_GAMMA2 <= a0 = a mod^+ MLDSA_Q - MLDSA_Q < 0. - * Assumes a to be standard representative. +/** + * For finite field element a, compute high and low bits a0, a1 such that + * a mod^+ MLDSA_Q = a1 * 2 * MLDSA_GAMMA2 + a0 with + * -MLDSA_GAMMA2 < a0 <= MLDSA_GAMMA2 except if + * a1 = (MLDSA_Q-1)/(MLDSA_GAMMA2*2) where we set a1 = 0 and + * -MLDSA_GAMMA2 <= a0 = a mod^+ MLDSA_Q - MLDSA_Q < 0. Assumes a to be + * standard representative. * - * Arguments: - int32_t a: input element - * - int32_t *a0: pointer to output element a0 - * - int32_t *a1: pointer to output element a1 + * @reference{In the reference implementation, a1 is passed as a return value + * instead.} * - * Reference: a1 is passed as a return value instead - **************************************************/ + * @param[out] a0 Pointer to output element a0. + * @param[out] a1 Pointer to output element a1. + * @param a Input element. 
+ */ static MLD_INLINE void mld_decompose(int32_t *a0, int32_t *a1, int32_t a) __contract__( requires(memory_no_alias(a0, sizeof(int32_t))) @@ -115,10 +112,34 @@ __contract__( #if MLD_CONFIG_PARAMETER_SET == 44 /* check-magic: 1488 == 2 * intdiv(intdiv(MLDSA_Q - 1, 88), 128) */ /* check-magic: 11275 == floor(2**24 / 1488) */ + /* check-magic: 1560281088 == 1 / (1 / 1488 - 11275 / 2**24) */ /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact - * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲ - * 1 / 1488. + * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact for + * 0 <= f1' < 2^16. + * + * To see this, consider the (signed) error f1' * (1 / B - 11275 / 2^24) + * between f1' / B and the (under-)approximation f1' * 11275 / 2^24. Because + * eps := 1 / B - 11275 / 2^24 is 1 / 1560281088 ≈ 2^(-30.54) < 2^(-30), we + * have 0 <= f1' * eps < 2^16 * 2^(-30) = 1 / 2^14 < 1 / 2^11 < 1 / B (note + * that f1' is non-negative). + * + * On the other hand, 1 / B is the spacing between the integral multiples + * of 1 / B, which includes all rounding boundaries n + 0.5 (since B is even). + * Hence, if f1' / B is not of the form n + 0.5, then it is at least 1 / B + * away from the nearest rounding boundary, so moving from f1' / B to + * f1' * 11275 / 2^24 does not affect the rounding result, no matter the type + * of rounding used in either side. In particular, we have round-(f1' / B) = + * round(f1' * 11275 / 2^24) as claimed. + * + * As for the remaining case where f1' / B _is_ of the form n + 0.5, because + * f1' * 11275 / 2^24 is slightly but strictly below f1' / B = n + 0.5 (note + * that f1' and thus the error f1' * eps cannot be 0 here), it is always + * rounded down to n. More precisely, we have round-(f1' / B) = + * round(f1' * 11275 / 2^24), where the round-down on the LHS is essential, + * and on the RHS the type of rounding again does not matter. This concludes + * the proof. 
+ * + * See proofs/isabelle/compress for a formalization of the above argument. */ *a1 = (*a1 * 11275 + (1 << 23)) >> 24; mld_assert(*a1 >= 0 && *a1 <= 44); @@ -128,10 +149,13 @@ __contract__( #else /* MLD_CONFIG_PARAMETER_SET == 44 */ /* check-magic: 4092 == 2 * intdiv(intdiv(MLDSA_Q - 1, 32), 128) */ /* check-magic: 1025 == floor(2**22 / 4092) */ + /* check-magic: 4290772992 == 1 / (1 / 4092 - 1025 / 2**22) */ /* - * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact - * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲ - * 1 / 4092. + * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact for + * 0 <= f1' < 2^16. Following the same argument above, it suffices to show + * that f1' * eps < 1 / B, where eps := 1 / B - 1025 / 2^22. Indeed, we have + * eps = 1 / 4290772992 ≈ 2^(-31.99) < 2^(-31), therefore f1' * eps < + * 2^16 * 2^(-31) = 1 / 2^15 < 1 / 2^12 < 1 / B. */ *a1 = (*a1 * 1025 + (1 << 21)) >> 22; mld_assert(*a1 >= 0 && *a1 <= 16); @@ -146,17 +170,16 @@ __contract__( mld_ct_cmask_neg_i32((MLDSA_Q - 1) / 2 - *a0)); } -/************************************************* - * Name: mld_make_hint - * - * Description: Compute hint bit indicating whether the low bits of the - * input element overflow into the high bits. +/** + * Compute hint bit indicating whether the low bits of the input element + * overflow into the high bits. * - * Arguments: - int32_t a0: low bits of input element - * - int32_t a1: high bits of input element + * @param a0 Low bits of input element. + * @param a1 High bits of input element. * - * Returns 1 if overflow, 0 otherwise - **************************************************/ + * @return 1 if overflow, 0 otherwise. 
+ */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE unsigned int mld_make_hint(int32_t a0, int32_t a1) __contract__( ensures(return_value >= 0 && return_value <= 1) @@ -171,16 +194,15 @@ __contract__( return 0; } -/************************************************* - * Name: mld_use_hint - * - * Description: Correct high bits according to hint. +/** + * Correct high bits according to hint. * - * Arguments: - int32_t a: input element - * - int32_t hint: hint bit + * @param a Input element. + * @param hint Hint bit. * - * Returns corrected high bits. - **************************************************/ + * @return Corrected high bits. + */ +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int32_t mld_use_hint(int32_t a, int32_t hint) __contract__( requires(hint >= 0 && hint <= 1) diff --git a/crypto/fipsmodule/ml_dsa/mldsa/sign.c b/crypto/fipsmodule/ml_dsa/mldsa/sign.c index 647253438c2..05643e20e56 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/sign.c +++ b/crypto/fipsmodule/ml_dsa/mldsa/sign.c @@ -24,8 +24,7 @@ * https://pq-crystals.org/dilithium/data/dilithium-specification-round3-20210208.pdf */ -#include -#include +#include "sign.h" #include "cbmc.h" #include "ct.h" @@ -35,7 +34,6 @@ #include "poly_kl.h" #include "polyvec.h" #include "randombytes.h" -#include "sign.h" #include "symmetric.h" /* Parameter set namespacing @@ -50,12 +48,12 @@ #define mld_compute_pack_z MLD_ADD_PARAM_SET(mld_compute_pack_z) #define mld_attempt_signature_generation \ MLD_ADD_PARAM_SET(mld_attempt_signature_generation) MLD_CONTEXT_PARAMETERS_8 -#define mld_compute_t0_t1_tr_from_sk_components \ - MLD_ADD_PARAM_SET(mld_compute_t0_t1_tr_from_sk_components) \ - MLD_CONTEXT_PARAMETERS_7 -/* End of parameter set namespacing */ - +#define mld_compute_pack_t0_t1 \ + MLD_ADD_PARAM_SET(mld_compute_pack_t0_t1) MLD_CONTEXT_PARAMETERS_5 +#define mld_get_max_signing_attempts \ + MLD_ADD_PARAM_SET(mld_get_max_signing_attempts) +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) static int mld_check_pct(uint8_t const 
pk[MLDSA_CRYPTO_PUBLICKEYBYTES], uint8_t const sk[MLDSA_CRYPTO_SECRETKEYBYTES], MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) @@ -63,26 +61,34 @@ __contract__( requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) ensures(return_value == 0 - || return_value == MLD_ERR_FAIL - || return_value == MLD_ERR_OUT_OF_MEMORY - || return_value == MLD_ERR_RNG_FAIL) + || return_value == MLD_ERR_FAIL + || return_value == MLD_ERR_OUT_OF_MEMORY + || return_value == MLD_ERR_RNG_FAIL + || return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) ); #if defined(MLD_CONFIG_KEYGEN_PCT) -/************************************************* - * @[FIPS140_3_IG] - * (https://csrc.nist.gov/csrc/media/Projects/cryptographic-module-validation-program/documents/fips%20140-3/FIPS%20140-3%20IG.pdf) +/** + * Pair-wise Consistency Test (PCT) for DSA keypairs. + * + * @[FIPS140_3_IG] TE10.35.02 + * (https://csrc.nist.gov/csrc/media/Projects/cryptographic-module-validation-program/documents/fips%20140-3/FIPS%20140-3%20IG.pdf). + * + * Validates that a generated public/private key pair can correctly sign and + * verify data. Performs signature generation using the private key (sk), + * followed by signature verification using the public key (pk). * - * TE10.35.02: Pair-wise Consistency Test (PCT) for DSA keypairs + * @note @[FIPS204] requires that public/private key pairs are to be used + * only for the calculation and/or verification of digital signatures. * - * Purpose: Validates that a generated public/private key pair can correctly - * sign and verify data. Test performs signature generation using the private - * key (sk), followed by signature verification using the public key (pk). - * Returns 0 if the signature was successfully verified, non-zero if it cannot. + * @param[in] pk Public key. + * @param[in] sk Secret key. + * @param context Application context. 
Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. * - * Note: @[FIPS204] requires that public/private key pairs are to be used only - * for the calculation and/of verification of digital signatures. - **************************************************/ + * @return 0 if the signature was successfully verified, non-zero otherwise. + */ static int mld_check_pct(uint8_t const pk[MLDSA_CRYPTO_PUBLICKEYBYTES], uint8_t const sk[MLDSA_CRYPTO_SECRETKEYBYTES], MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) @@ -197,52 +203,56 @@ __contract__( #endif /* !MLD_CONFIG_SERIAL_FIPS202_ONLY */ } -/************************************************* - * Name: mld_compute_t0_t1_tr_from_sk_components +/** + * Compute t = A*s1hat + s2 row by row, decompose each row into t0[k] and + * t1[k] via power2round, and bit-pack t1[k] into pk_t1 and t0[k] into the + * t0_packed buffer. Used by both keygen and pk_from_sk. * - * Description: Computes t0, t1, tr, and pk from secret key components - * rho, s1, s2. This is the shared computation used by - * both keygen and generating the public key from the - * secret key. + * @param[out] pk_t1 Output buffer for packed t1 (size + * MLDSA_K * MLDSA_POLYT1_PACKEDBYTES; i.e. the t1 + * region of pk). + * @param[out] t0_packed Output buffer for packed t0 (size + * MLDSA_K * MLDSA_POLYT0_PACKEDBYTES). + * @param[in] s1hat s1 in NTT domain. + * @param[in] s2 s2. + * @param[in] rho Byte array containing seed rho. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. 
* - * Arguments: - mld_polyveck *t0: output t0 - * - mld_polyveck *t1: output t1 - * - uint8_t tr[MLDSA_TRBYTES]: output tr - * - uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: output public key - * - const uint8_t rho[MLDSA_SEEDBYTES]: input rho - * - const mld_polyvecl *s1: input s1 - * - const mld_polyveck *s2: input s2 - **************************************************/ + * @return - 0: Success. + * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is used and + * an allocation via MLD_CUSTOM_ALLOC returned NULL. + */ MLD_MUST_CHECK_RETURN_VALUE -static int mld_compute_t0_t1_tr_from_sk_components( - mld_polyveck *t0, mld_polyveck *t1, uint8_t tr[MLDSA_TRBYTES], - uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[MLDSA_SEEDBYTES], - const mld_polyvecl *s1, const mld_polyveck *s2, +static int mld_compute_pack_t0_t1( + uint8_t pk_t1[MLDSA_K * MLDSA_POLYT1_PACKEDBYTES], + uint8_t t0_packed[MLDSA_K * MLDSA_POLYT0_PACKEDBYTES], + const mld_polyvecl *s1hat, const mld_polyveck *s2, + const uint8_t rho[MLDSA_SEEDBYTES], MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) __contract__( - requires(memory_no_alias(t0, sizeof(mld_polyveck))) - requires(memory_no_alias(t1, sizeof(mld_polyveck))) - requires(memory_no_alias(tr, MLDSA_TRBYTES)) - requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) - requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) - requires(memory_no_alias(s1, sizeof(mld_polyvecl))) + requires(memory_no_alias(pk_t1, MLDSA_K * MLDSA_POLYT1_PACKEDBYTES)) + requires(memory_no_alias(t0_packed, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) + requires(memory_no_alias(s1hat, sizeof(mld_polyvecl))) requires(memory_no_alias(s2, sizeof(mld_polyveck))) - requires(forall(l0, 0, MLDSA_L, array_bound(s1->vec[l0].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) - requires(forall(k0, 0, MLDSA_K, array_bound(s2->vec[k0].coeffs, 0, MLDSA_N, MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) - assigns(memory_slice(t0, sizeof(mld_polyveck))) - 
assigns(memory_slice(t1, sizeof(mld_polyveck))) - assigns(memory_slice(tr, MLDSA_TRBYTES)) - assigns(memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) - ensures(forall(k1, 0, MLDSA_K, array_bound(t0->vec[k1].coeffs, 0, MLDSA_N, -(1<<(MLDSA_D-1)) + 1, (1<<(MLDSA_D-1)) + 1))) - ensures(forall(k2, 0, MLDSA_K, array_bound(t1->vec[k2].coeffs, 0, MLDSA_N, 0, 1 << 10))) + requires(memory_no_alias(rho, MLDSA_SEEDBYTES)) + requires(forall(l1, 0, MLDSA_L, + array_abs_bound(s1hat->vec[l1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + requires(forall(k2, 0, MLDSA_K, + array_bound(s2->vec[k2].coeffs, 0, MLDSA_N, + MLD_POLYETA_UNPACK_LOWER_BOUND, MLDSA_ETA + 1))) + assigns(memory_slice(pk_t1, MLDSA_K * MLDSA_POLYT1_PACKEDBYTES)) + assigns(memory_slice(t0_packed, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) ensures(return_value == 0 || return_value == MLD_ERR_OUT_OF_MEMORY)) { + unsigned int k; int ret; MLD_ALLOC(mat, mld_polymat, 1, context); - MLD_ALLOC(s1hat, mld_polyvecl, 1, context); - MLD_ALLOC(t, mld_polyveck, 1, context); + MLD_ALLOC(t0k, mld_poly, 1, context); + MLD_ALLOC(t1k, mld_poly, 1, context); - if (mat == NULL || s1hat == NULL || t == NULL) + if (mat == NULL || t0k == NULL || t1k == NULL) { ret = MLD_ERR_OUT_OF_MEMORY; goto cleanup; @@ -251,40 +261,52 @@ __contract__( /* Expand matrix */ mld_polyvec_matrix_expand(mat, rho); - /* Matrix-vector multiplication */ - *s1hat = *s1; - mld_polyvecl_ntt(s1hat); - mld_polyvec_matrix_pointwise_montgomery(t, mat, s1hat); - mld_polyveck_invntt_tomont(t); - - /* Add error vector s2 */ - mld_polyveck_add(t, s2); - - /* Reference: The following reduction is not present in the reference - * implementation. Omitting this reduction requires the output of - * the invntt to be small enough such that the addition of s2 does - * not result in absolute values >= MLDSA_Q. While our C, x86_64, - * and AArch64 invntt implementations produce small enough - * values for this to work out, it complicates the bounds - * reasoning. 
We instead add an additional reduction, and can - * consequently, relax the bounds requirements for the invntt. - */ - mld_polyveck_reduce(t); - - /* Decompose to get t1, t0 */ - mld_polyveck_caddq(t); - mld_polyveck_power2round(t1, t0, t); - - /* Pack public key and compute tr */ - mld_pack_pk(pk, rho, t1); - mld_shake256(tr, MLDSA_TRBYTES, pk, MLDSA_CRYPTO_PUBLICKEYBYTES); + for (k = 0; k < MLDSA_K; k++) + __loop__( + assigns(k, memory_slice(pk_t1, MLDSA_K * MLDSA_POLYT1_PACKEDBYTES), + memory_slice(t0_packed, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES), + memory_slice(t0k, sizeof(mld_poly)), + memory_slice(t1k, sizeof(mld_poly)) + MLD_IF_REDUCE_RAM(, memory_slice(mat, sizeof(mld_polymat)))) + invariant(k <= MLDSA_K) + decreases(MLDSA_K - k) + ) + { + /* t0k = (A * s1hat)_k in NTT domain */ + mld_polyvec_matrix_pointwise_montgomery_row(t0k, mat, s1hat, k); + + /* t0k = invNTT(t0k) */ + mld_poly_invntt_tomont(t0k); + + /* t0k += s2[k] */ + mld_poly_add(t0k, &s2->vec[k]); + + /* Reference: The following reduction is not present in the reference + * implementation. Omitting this reduction requires the output + * of the invntt to be small enough such that the addition of + * s2 does not result in absolute values >= MLDSA_Q. While our + * C, x86_64, and AArch64 invntt implementations produce small + * enough values for this to work out, it complicates the + * bounds reasoning. We instead add an additional reduction, + * and can consequently, relax the bounds requirements for the + * invntt. + */ + mld_poly_reduce(t0k); + + /* Decompose into t1[k] and t0[k] (in place into t0k). */ + mld_poly_caddq(t0k); + mld_poly_power2round(t1k, t0k, t0k); + + /* Pack t1[k] into pk and t0[k] into the t0 output buffer. */ + mld_polyt1_pack(pk_t1 + k * MLDSA_POLYT1_PACKEDBYTES, t1k); + mld_polyt0_pack(t0_packed + k * MLDSA_POLYT0_PACKEDBYTES, t0k); + } ret = 0; - cleanup: /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. 
*/ - MLD_FREE(t, mld_polyveck, 1, context); - MLD_FREE(s1hat, mld_polyvecl, 1, context); + MLD_FREE(t1k, mld_poly, 1, context); + MLD_FREE(t0k, mld_poly, 1, context); MLD_FREE(mat, mld_polymat, 1, context); return ret; } @@ -298,16 +320,15 @@ int mld_sign_keypair_internal(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], { int ret; const uint8_t *rho, *rhoprime, *key; + MLD_ALLOC(seedbuf, uint8_t, 2 * MLDSA_SEEDBYTES + MLDSA_CRHBYTES, context); MLD_ALLOC(inbuf, uint8_t, MLDSA_SEEDBYTES + 2, context); MLD_ALLOC(tr, uint8_t, MLDSA_TRBYTES, context); MLD_ALLOC(s1, mld_polyvecl, 1, context); MLD_ALLOC(s2, mld_polyveck, 1, context); - MLD_ALLOC(t1, mld_polyveck, 1, context); - MLD_ALLOC(t0, mld_polyveck, 1, context); if (seedbuf == NULL || inbuf == NULL || tr == NULL || s1 == NULL || - s2 == NULL || t1 == NULL || t0 == NULL) + s2 == NULL) { ret = MLD_ERR_OUT_OF_MEMORY; goto cleanup; @@ -329,24 +350,38 @@ int mld_sign_keypair_internal(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], /* Sample s1 and s2 */ mld_sample_s1_s2(s1, s2, rhoprime); - /* Compute t0, t1, tr, and pk from rho, s1, s2 */ - ret = mld_compute_t0_t1_tr_from_sk_components(t0, t1, tr, pk, rho, s1, s2, - context); + /* Pack s1 into sk before NTT */ + mld_pack_sk_s1(sk, s1); + + /* NTT s1 in place to use as s1hat */ + mld_polyvecl_ntt(s1); + + /* Pack rho into pk */ + mld_memcpy(pk, rho, MLDSA_SEEDBYTES); + + /* Compute t = A*s1hat + s2 row by row, decompose into t1/t0, and pack + * t1 into pk and t0 directly into the t0 region of sk. 
*/ + ret = mld_compute_pack_t0_t1(pk + MLDSA_SEEDBYTES, + sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + + MLDSA_L * MLDSA_POLYETA_PACKEDBYTES + + MLDSA_K * MLDSA_POLYETA_PACKEDBYTES, + s1, s2, rho, context); if (ret != 0) { goto cleanup; } - /* Pack secret key */ - mld_pack_sk(sk, rho, tr, key, t0, s1, s2); + /* Compute tr = H(pk) */ + mld_shake256(tr, MLDSA_TRBYTES, pk, MLDSA_CRYPTO_PUBLICKEYBYTES); + + /* Pack remaining secret key components (s1 and t0 already packed) */ + mld_pack_sk_rho_key_tr_s2(sk, rho, tr, key, s2); /* Constant time: pk is the public key, inherently public data */ MLD_CT_TESTING_DECLASSIFY(pk, MLDSA_CRYPTO_PUBLICKEYBYTES); cleanup: /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ - MLD_FREE(t0, mld_polyveck, 1, context); - MLD_FREE(t1, mld_polyveck, 1, context); MLD_FREE(s2, mld_polyveck, 1, context); MLD_FREE(s1, mld_polyvecl, 1, context); MLD_FREE(tr, uint8_t, MLDSA_TRBYTES, context); @@ -363,6 +398,7 @@ int mld_sign_keypair_internal(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], return mld_check_pct(pk, sk, context); } +#if !defined(MLD_CONFIG_CORE_API_ONLY) #if !defined(MLD_CONFIG_NO_RANDOMIZED_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API @@ -370,45 +406,48 @@ int mld_sign_keypair(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) { - MLD_ALIGN uint8_t seed[MLDSA_SEEDBYTES]; int ret; + MLD_ALLOC(seed, uint8_t, MLDSA_SEEDBYTES, context); + + if (seed == NULL) + { + ret = MLD_ERR_OUT_OF_MEMORY; + goto cleanup; + } + if (mld_randombytes(seed, MLDSA_SEEDBYTES) != 0) { ret = MLD_ERR_RNG_FAIL; goto cleanup; } - MLD_CT_TESTING_SECRET(seed, sizeof(seed)); + MLD_CT_TESTING_SECRET(seed, MLDSA_SEEDBYTES); ret = mld_sign_keypair_internal(pk, sk, seed, context); cleanup: /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. 
*/ - mld_zeroize(seed, sizeof(seed)); + MLD_FREE(seed, uint8_t, MLDSA_SEEDBYTES, context); return ret; } #endif /* !MLD_CONFIG_NO_RANDOMIZED_API */ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ -/************************************************* - * Name: mld_H +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * Abstracts application of SHAKE256 to one, two or three blocks of data, + * yielding a user-requested size of output. * - * Description: Abstracts application of SHAKE256 to - * one, two or three blocks of data, - * yielding a user-requested size of - * output. - * - * Arguments: - uint8_t *out: pointer to output - * - size_t outlen: requested output length in bytes - * - const uint8_t *in1: pointer to input block 1 - * Must NOT be NULL - * - size_t in1len: length of input in1 bytes - * - const uint8_t *in2: pointer to input block 2 - * May be NULL if in2len=0, in which case - * this block is ignored - * - size_t in2len: length of input in2 bytes - * - const uint8_t *in3: pointer to input block 3 - * May be NULL if in3len=0, in which case - * this block is ignored - * - size_t in3len: length of input in3 bytes - **************************************************/ + * @param[out] out Pointer to output. + * @param outlen Requested output length in bytes. + * @param[in] in1 Pointer to input block 1. Must NOT be NULL. + * @param in1len Length of input in1 in bytes. + * @param[in] in2 Pointer to input block 2. May be NULL if in2len == 0, + * in which case this block is ignored. + * @param in2len Length of input in2 in bytes. + * @param[in] in3 Pointer to input block 3. May be NULL if in3len == 0, + * in which case this block is ignored. + * @param in3len Length of input in3 in bytes. 
+ */ static void mld_H(uint8_t *out, size_t outlen, const uint8_t *in1, size_t in1len, const uint8_t *in2, size_t in2len, const uint8_t *in3, size_t in3len) @@ -442,44 +481,62 @@ __contract__( /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ mld_zeroize(&state, sizeof(state)); } +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ -/************************************************* - * Name: mld_compute_pack_z - * - * Description: Computes z = y + s1*c, checks that z has coefficients smaller - * than MLDSA_GAMMA1 - MLDSA_BETA, and packs z into the - * signature buffer. +#if !defined(MLD_CONFIG_NO_SIGN_API) +/* Reference: The reference implementation does not explicitly check the + * maximum nonce value, but instead loops indefinitely (even when the nonce + * would overflow). Internally, sampling of y uses + * (nonce*L), (nonce*L+1), ..., (nonce*L + L - 1). + * Hence, there are no overflows if nonce < (UINT16_MAX - L)/L. + * Explicitly checking for this explicitly allows us to prove type-safety. */ +#define MLD_NONCE_UB ((UINT16_MAX - MLDSA_L) / MLDSA_L) + +/** + * Compute z = y + s1*c, check that z has coefficients smaller than + * MLDSA_GAMMA1 - MLDSA_BETA, and pack z into the signature buffer. * - * Arguments: - uint8_t *sig: output signature - * - const mld_poly *cp: challenge polynomial - * - const polyvecl *s1: secret vector s1 - * - const polyvecl *y: masking vector y + * @reference{This function is inlined into mld_sign_signature in the + * reference implementation.} * - * Returns: - 0: Success (z has coefficients smaller than - * MLDSA_GAMMA1 - MLDSA_BETA,) - * - MLD_ERR_FAIL: z rejected (norm check failed) - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. + * @param[in,out] sig Output signature. + * @param[in] cp Challenge polynomial. + * @param[in] s1hat Secret vector s1 in NTT domain. 
+ * @param[in] y Masking vector y (or seed in REDUCE_RAM mode). + * @param[out] z Scratch polynomial for z computation. + * @param[out] tmp Scratch polynomial. * - * Reference: This function is inlined into mld_sign_signature in the - * reference implementation. - **************************************************/ + * @return - 0: Success (z has coefficients smaller than + * MLDSA_GAMMA1 - MLDSA_BETA). + * - MLD_ERR_FAIL: z rejected (norm check failed). + * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is used and + * an allocation via MLD_CUSTOM_ALLOC returned NULL. + */ MLD_MUST_CHECK_RETURN_VALUE static int mld_compute_pack_z(uint8_t sig[MLDSA_CRYPTO_BYTES], - const mld_poly *cp, const mld_polyvecl *s1, - const mld_polyvecl *y, mld_poly *z) + const mld_poly *cp, const mld_sk_s1hat *s1hat, + const mld_yvec *y, mld_poly *z, mld_poly *tmp) __contract__( requires(memory_no_alias(sig, MLDSA_CRYPTO_BYTES)) requires(memory_no_alias(cp, sizeof(mld_poly))) - requires(memory_no_alias(s1, sizeof(mld_polyvecl))) - requires(memory_no_alias(y, sizeof(mld_polyvecl))) + requires(memory_no_alias(s1hat, sizeof(mld_sk_s1hat))) + requires(memory_no_alias(y, sizeof(mld_yvec))) requires(memory_no_alias(z, sizeof(mld_poly))) + requires(memory_no_alias(tmp, sizeof(mld_poly))) requires(array_abs_bound(cp->coeffs, 0, MLDSA_N, MLD_NTT_BOUND)) - requires(forall(k0, 0, MLDSA_L, - array_bound(y->vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) - requires(forall(k1, 0, MLDSA_L, array_abs_bound(s1->vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + MLD_IF_NOT_REDUCE_RAM( + requires(forall(k0, 0, MLDSA_L, + array_bound(y->vec.vec[k0].coeffs, 0, MLDSA_N, -(MLDSA_GAMMA1 - 1), MLDSA_GAMMA1 + 1))) + requires(forall(k1, 0, MLDSA_L, array_abs_bound(s1hat->vec.vec[k1].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + ) + MLD_IF_REDUCE_RAM( + requires(memory_no_alias(s1hat->packed, MLDSA_L * MLDSA_POLYETA_PACKEDBYTES)) + requires(memory_no_alias(y->rhoprime, MLDSA_CRHBYTES)) + 
requires(y->nonce <= MLD_NONCE_UB) + ) assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) assigns(memory_slice(z, sizeof(mld_poly))) + assigns(memory_slice(tmp, sizeof(mld_poly))) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ) @@ -488,13 +545,18 @@ __contract__( uint32_t z_invalid; for (i = 0; i < MLDSA_L; i++) __loop__( - assigns(i, memory_slice(z, sizeof(mld_poly)), memory_slice(sig, MLDSA_CRYPTO_BYTES)) + assigns(i, memory_slice(z, sizeof(mld_poly)), + memory_slice(tmp, sizeof(mld_poly)), + memory_slice(sig, MLDSA_CRYPTO_BYTES)) invariant(i <= MLDSA_L) + decreases(MLDSA_L - i) ) { - mld_poly_pointwise_montgomery(z, cp, &s1->vec[i]); + mld_sk_s1hat_get_poly(z, s1hat, i); + mld_poly_pointwise_montgomery(z, cp); mld_poly_invntt_tomont(z); - mld_poly_add(z, &y->vec[i]); + mld_yvec_get_poly(tmp, y, i); + mld_poly_add(z, tmp); mld_poly_reduce(z); z_invalid = mld_poly_chknorm(z, MLDSA_GAMMA1 - MLDSA_BETA); @@ -522,118 +584,133 @@ __contract__( return 0; } -/* Reference: The reference implementation does not explicitly check the - * maximum nonce value, but instead loops indefinitely (even when the nonce - * would overflow). Internally, sampling of y uses - * (nonceL), (nonceL+1), ... (nonce*L+L-1). - * Hence, there are no overflows if nonce < (UINT16_MAX - L)/L. - * Explicitly checking for this explicitly allows us to prove type-safety. - * Note that FIPS204 explicitly allows an upper-bound this loop of - * 814 (< (UINT16_MAX - L)/L) - see @[FIPS204, Appendix C]. */ -#define MLD_NONCE_UB ((UINT16_MAX - MLDSA_L) / MLDSA_L) +/* User-facing bound on signing attempts. See MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * in mldsa_native_config.h. Default is chosen so that failure probability + * is < 2^{-256}, that is, signatures will practically always succeed. 
*/ +#ifndef MLD_CONFIG_MAX_SIGNING_ATTEMPTS +#define MLD_CONFIG_MAX_SIGNING_ATTEMPTS MLD_NONCE_UB +#endif -/************************************************* - * Name: attempt_signature_generation - * - * Description: Attempts to generate a single signature. +#if !defined(MLD_ALLOW_NONCOMPLIANT_SIGNING_BOUND) && \ + MLD_CONFIG_MAX_SIGNING_ATTEMPTS < 814 +#error Bad configuration: MLD_CONFIG_MAX_SIGNING_ATTEMPTS must be >= 814 for FIPS 204 compliance @[FIPS204, Appendix C] +#endif + +#if MLD_CONFIG_MAX_SIGNING_ATTEMPTS < 1 +#error Bad configuration: MLD_CONFIG_MAX_SIGNING_ATTEMPTS must be >= 1 +#endif + +#if MLD_CONFIG_MAX_SIGNING_ATTEMPTS > MLD_NONCE_UB +#error Bad configuration: MLD_CONFIG_MAX_SIGNING_ATTEMPTS exceeds the maximum allowed value. +#endif + +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE uint16_t mld_get_max_signing_attempts(void) +__contract__( + ensures(return_value >= 1) + ensures(return_value <= MLD_NONCE_UB) +) +{ + /* cassert(0) ensures CBMC uses the contract rather than inlining the body, + * keeping proofs agnostic of the configured value. */ + cassert(0); + return MLD_CONFIG_MAX_SIGNING_ATTEMPTS; +} + +/** + * Attempt to generate a single signature. 
* - * Arguments: - uint8_t *sig: pointer to output signature - * - const uint8_t *mu: pointer to message or hash - * of exactly MLDSA_CRHBYTES bytes - * - const uint8_t *rhoprime: pointer to randomness seed - * - uint16_t nonce: current nonce value - * - const mld_polymat *mat: expanded matrix - * - const polyvecl *s1: secret vector s1 - * - const polyveck *s2: secret vector s2 - * - const polyveck *t0: vector t0 + * @reference{This code differs from the reference implementation in that it + * factors out the core signature generation step into a distinct function + * here in order to improve efficiency of CBMC proof.} * - * Returns: - 0: Signature generation succeeded - * - MLD_ERR_FAIL: Signature rejected (norm check failed) - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. + * @param[out] sig Pointer to output signature. + * @param[in] mu Pointer to message or hash of exactly MLDSA_CRHBYTES + * bytes. + * @param[in] rhoprime Pointer to randomness seed. + * @param nonce Current nonce value. + * @param[in] mat Expanded matrix. + * @param[in] s1hat Secret vector s1 in NTT domain. + * @param[in] s2hat Secret vector s2 in NTT domain. + * @param[in] t0hat Vector t0 in NTT domain. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. * - * Reference: This code differs from the reference implementation - * in that it factors out the core signature generation - * step into a distinct function here in order to improve - * efficiency of CBMC proof. - **************************************************/ + * @return - 0: Signature generation succeeded. + * - MLD_ERR_FAIL: Signature rejected (norm check failed). + * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is used and + * an allocation via MLD_CUSTOM_ALLOC returned NULL. 
+ */ MLD_MUST_CHECK_RETURN_VALUE static int mld_attempt_signature_generation( uint8_t sig[MLDSA_CRYPTO_BYTES], const uint8_t *mu, const uint8_t rhoprime[MLDSA_CRHBYTES], uint16_t nonce, mld_polymat *mat, - const mld_polyvecl *s1, const mld_polyveck *s2, const mld_polyveck *t0, - MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) + const mld_sk_s1hat *s1hat, const mld_sk_s2hat *s2hat, + const mld_sk_t0hat *t0hat, MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) __contract__( requires(memory_no_alias(sig, MLDSA_CRYPTO_BYTES)) requires(memory_no_alias(mu, MLDSA_CRHBYTES)) requires(memory_no_alias(rhoprime, MLDSA_CRHBYTES)) requires(memory_no_alias(mat, sizeof(mld_polymat))) - requires(memory_no_alias(s1, sizeof(mld_polyvecl))) - requires(memory_no_alias(s2, sizeof(mld_polyveck))) - requires(memory_no_alias(t0, sizeof(mld_polyveck))) + requires(memory_no_alias(s1hat, sizeof(mld_sk_s1hat))) + requires(memory_no_alias(s2hat, sizeof(mld_sk_s2hat))) + requires(memory_no_alias(t0hat, sizeof(mld_sk_t0hat))) requires(nonce <= MLD_NONCE_UB) - requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - requires(forall(k2, 0, MLDSA_K, array_abs_bound(t0->vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - requires(forall(k3, 0, MLDSA_L, array_abs_bound(s1->vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - requires(forall(k4, 0, MLDSA_K, array_abs_bound(s2->vec[k4].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + MLD_IF_NOT_REDUCE_RAM( + requires(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, + array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + requires(forall(k2, 0, MLDSA_K, array_abs_bound(t0hat->vec.vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + requires(forall(k3, 0, MLDSA_L, array_abs_bound(s1hat->vec.vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + requires(forall(k4, 0, MLDSA_K, array_abs_bound(s2hat->vec.vec[k4].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + ) + MLD_IF_REDUCE_RAM( + requires(memory_no_alias(s1hat->packed, MLDSA_L 
* MLDSA_POLYETA_PACKEDBYTES)) + requires(memory_no_alias(s2hat->packed, MLDSA_K * MLDSA_POLYETA_PACKEDBYTES)) + requires(memory_no_alias(t0hat->packed, MLDSA_K * MLDSA_POLYT0_PACKEDBYTES)) + ) assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) + MLD_IF_REDUCE_RAM( + assigns(memory_slice(mat, sizeof(mld_polymat))) + ) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ) { - unsigned int n; + unsigned int k; uint32_t w0_invalid, h_invalid; int ret; - /* TODO: Remove the following workaround for - * https://github.com/diffblue/cbmc/issues/8813 */ - typedef MLK_UNION_OR_STRUCT - { - mld_polyvecl y; - mld_polyveck h; - } - yh_u; - mld_polyvecl *y; - mld_polyveck *h; - /* TODO: Remove the following workaround for - * https://github.com/diffblue/cbmc/issues/8813 */ - typedef MLK_UNION_OR_STRUCT + typedef union { mld_polyveck w1; mld_polyvecl tmp; - } - w1tmp_u; + } w1tmp_u; mld_polyveck *w1; mld_polyvecl *tmp; MLD_ALLOC(challenge_bytes, uint8_t, MLDSA_CTILDEBYTES, context); - MLD_ALLOC(yh, yh_u, 1, context); + MLD_ALLOC(y, mld_yvec, 1, context); MLD_ALLOC(z, mld_poly, 1, context); MLD_ALLOC(w1tmp, w1tmp_u, 1, context); MLD_ALLOC(w0, mld_polyveck, 1, context); MLD_ALLOC(cp, mld_poly, 1, context); MLD_ALLOC(t, mld_poly, 1, context); - if (challenge_bytes == NULL || yh == NULL || z == NULL || w1tmp == NULL || + if (challenge_bytes == NULL || y == NULL || z == NULL || w1tmp == NULL || w0 == NULL || cp == NULL || t == NULL) { ret = MLD_ERR_OUT_OF_MEMORY; goto cleanup; } - y = &yh->y; - h = &yh->h; w1 = &w1tmp->w1; tmp = &w1tmp->tmp; - /* Sample intermediate vector y */ - mld_polyvecl_uniform_gamma1(y, rhoprime, nonce); + /* Sample/initialize intermediate vector y */ + mld_yvec_init(y, rhoprime, nonce); - /* Matrix-vector multiplication */ - *tmp = *y; - mld_polyvecl_ntt(tmp); - mld_polyvec_matrix_pointwise_montgomery(w0, mat, tmp); - mld_polyveck_invntt_tomont(w0); + /* Matrix-vector multiplication, fused with y sampling in 
REDUCE_RAM mode */ + mld_polyvec_matrix_pointwise_montgomery_yvec(w0, mat, y, tmp); /* Decompose w and call the random oracle */ mld_polyveck_caddq(w0); @@ -651,63 +728,84 @@ __contract__( mld_poly_ntt(cp); /* Compute z, reject if it reveals secret */ - ret = mld_compute_pack_z(sig, cp, s1, y, t); - if (ret) + ret = mld_compute_pack_z(sig, cp, s1hat, y, t, z); + if (ret != 0) { goto cleanup; } - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - mld_polyveck_pointwise_poly_montgomery(h, cp, s2); - mld_polyveck_invntt_tomont(h); - mld_polyveck_sub(w0, h); - mld_polyveck_reduce(w0); - - w0_invalid = mld_polyveck_chknorm(w0, MLDSA_GAMMA2 - MLDSA_BETA); - /* Constant time: w0_invalid may be leaked - see comment for z_invalid. */ - MLD_CT_TESTING_DECLASSIFY(&w0_invalid, sizeof(uint32_t)); - if (w0_invalid) + /* Compute w0 - cs2 + ct0 per-component, checking norms incrementally. + * This avoids allocating a full polyveck for h. */ + for (k = 0; k < MLDSA_K; k++) + __loop__( + assigns(k, + object_whole(z), + object_whole(w0)) + invariant(k <= MLDSA_K) + invariant(forall(k0, k, MLDSA_K, + array_abs_bound(w0->vec[k0].coeffs, 0, MLDSA_N, MLDSA_GAMMA2 + 1))) + decreases(MLDSA_K - k) + ) { - ret = MLD_ERR_FAIL; /* reject */ - goto cleanup; - } + /* Compute cs2[k] and subtract from w0[k] */ + mld_sk_s2hat_get_poly(z, s2hat, k); + mld_poly_pointwise_montgomery(z, cp); + mld_poly_invntt_tomont(z); - /* Compute hints for w1 */ - mld_polyveck_pointwise_poly_montgomery(h, cp, t0); - mld_polyveck_invntt_tomont(h); - mld_polyveck_reduce(h); + mld_poly_sub(&w0->vec[k], z); + mld_poly_reduce(&w0->vec[k]); - h_invalid = mld_polyveck_chknorm(h, MLDSA_GAMMA2); - /* Constant time: h_invalid may be leaked - see comment for z_invalid. 
*/ - MLD_CT_TESTING_DECLASSIFY(&h_invalid, sizeof(uint32_t)); - if (h_invalid) - { - ret = MLD_ERR_FAIL; /* reject */ - goto cleanup; - } + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + w0_invalid = mld_poly_chknorm(&w0->vec[k], MLDSA_GAMMA2 - MLDSA_BETA); + /* Constant time: w0_invalid may be leaked - see comment for z_invalid. */ + MLD_CT_TESTING_DECLASSIFY(&w0_invalid, sizeof(uint32_t)); + if (w0_invalid) + { + ret = MLD_ERR_FAIL; /* reject */ + goto cleanup; + } - mld_polyveck_add(w0, h); + /* Compute ct0[k], check norm, and add to w0[k] */ + mld_sk_t0hat_get_poly(z, t0hat, k); + mld_poly_pointwise_montgomery(z, cp); + mld_poly_invntt_tomont(z); + mld_poly_reduce(z); + + h_invalid = mld_poly_chknorm(z, MLDSA_GAMMA2); + /* Constant time: h_invalid may be leaked - see comment for z_invalid. */ + MLD_CT_TESTING_DECLASSIFY(&h_invalid, sizeof(uint32_t)); + if (h_invalid) + { + ret = MLD_ERR_FAIL; /* reject */ + goto cleanup; + } + + mld_poly_add(&w0->vec[k], z); + } /* Constant time: At this point all norm checks have passed and we, hence, * know that the signature does not leak any secret information. * Consequently, any value that can be computed from the signature and public * key is considered public. * w0 and w1 are public as they can be computed from Az - ct = \alpha w1 + w0. - * h=c*t0 is public as both c and t0 are public. - * For a more detailed discussion, refer to https://eprint.iacr.org/2022/1406. + * h=c*t0 is public as both c and t0 are considered public. + * While t0 is not part of the public key, it can be reconstructed from + * a small number of signatures and need not be regarded as secret + * (see @[FIPS204, Section 6.1]). */ MLD_CT_TESTING_DECLASSIFY(w0, sizeof(*w0)); MLD_CT_TESTING_DECLASSIFY(w1, sizeof(*w1)); - n = mld_polyveck_make_hint(h, w0, w1); - if (n > MLDSA_OMEGA) + + /* Pack challenge bytes and hints. 
*/ + mld_pack_sig_c(sig, challenge_bytes); + + ret = mld_pack_sig_h(sig, w0, w1); + if (ret != 0) { - ret = MLD_ERR_FAIL; /* reject */ goto cleanup; } - /* All is well - write signature */ - mld_pack_sig_c_h(sig, challenge_bytes, h, n); /* Constant time: At this point it is clear that the signature is valid - it * can, hence, be considered public. */ MLD_CT_TESTING_DECLASSIFY(sig, MLDSA_CRYPTO_BYTES); @@ -720,7 +818,7 @@ __contract__( MLD_FREE(w0, mld_polyveck, 1, context); MLD_FREE(w1tmp, w1tmp_u, 1, context); MLD_FREE(z, mld_poly, 1, context); - MLD_FREE(yh, yh_u, 1, context); + MLD_FREE(y, mld_yvec, 1, context); MLD_FREE(challenge_bytes, uint8_t, MLDSA_CTILDEBYTES, context); return ret; @@ -738,14 +836,16 @@ int mld_sign_signature_internal(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, int ret; uint8_t *rho, *tr, *key, *mu, *rhoprime; uint16_t nonce = 0; + const uint16_t nonce_limit = mld_get_max_signing_attempts(); MLD_ALLOC(seedbuf, uint8_t, 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + 2 * MLDSA_CRHBYTES, context); MLD_ALLOC(mat, mld_polymat, 1, context); - MLD_ALLOC(s1, mld_polyvecl, 1, context); - MLD_ALLOC(t0, mld_polyveck, 1, context); - MLD_ALLOC(s2, mld_polyveck, 1, context); + MLD_ALLOC(s1hat, mld_sk_s1hat, 1, context); + MLD_ALLOC(t0hat, mld_sk_t0hat, 1, context); + MLD_ALLOC(s2hat, mld_sk_s2hat, 1, context); - if (seedbuf == NULL || mat == NULL || s1 == NULL || t0 == NULL || s2 == NULL) + if (seedbuf == NULL || mat == NULL || s1hat == NULL || t0hat == NULL || + s2hat == NULL) { ret = MLD_ERR_OUT_OF_MEMORY; goto cleanup; @@ -756,7 +856,7 @@ int mld_sign_signature_internal(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, key = tr + MLDSA_TRBYTES; mu = key + MLDSA_SEEDBYTES; rhoprime = mu + MLDSA_CRHBYTES; - mld_unpack_sk(rho, tr, key, t0, s1, s2, sk); + mld_unpack_sk(rho, tr, key, t0hat, s1hat, s2hat, sk); if (!externalmu) { @@ -777,46 +877,46 @@ int mld_sign_signature_internal(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, MLD_CT_TESTING_DECLASSIFY(rho, 
MLDSA_SEEDBYTES); /* Expand matrix and transform vectors */ mld_polyvec_matrix_expand(mat, rho); - mld_polyvecl_ntt(s1); - mld_polyveck_ntt(s2); - mld_polyveck_ntt(t0); - - /* By default, return failure. Flip to success and write output - * once signature generation succeeds. */ - ret = MLD_ERR_FAIL; /* Reference: This code is re-structured using a while(1), */ /* with explicit "continue" statements (rather than "goto") */ /* to implement rejection of invalid signatures. */ while (1) __loop__( - assigns(nonce, ret, object_whole(siglen), memory_slice(sig, MLDSA_CRYPTO_BYTES)) - invariant(nonce <= MLD_NONCE_UB) + MLD_IF_NOT_REDUCE_RAM( + assigns(nonce, ret, object_whole(siglen), memory_slice(sig, MLDSA_CRYPTO_BYTES)) + ) + MLD_IF_REDUCE_RAM( + assigns(nonce, ret, object_whole(siglen), memory_slice(sig, MLDSA_CRYPTO_BYTES), + memory_slice(mat, sizeof(mld_polymat))) + ) + invariant(nonce <= nonce_limit) /* t0, s1, s2, and mat are initialized above and are NOT changed by this */ /* loop. We can therefore re-assert their bounds here as part of the */ /* loop invariant. 
This makes proof noticeably faster with CBMC */ - invariant(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, - array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) - invariant(forall(k2, 0, MLDSA_K, array_abs_bound(t0->vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - invariant(forall(k3, 0, MLDSA_L, array_abs_bound(s1->vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - invariant(forall(k4, 0, MLDSA_K, array_abs_bound(s2->vec[k4].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) - invariant(ret == MLD_ERR_FAIL) + MLD_IF_NOT_REDUCE_RAM( + invariant(forall(k1, 0, MLDSA_K, forall(l1, 0, MLDSA_L, + array_bound(mat->vec[k1].vec[l1].coeffs, 0, MLDSA_N, 0, MLDSA_Q)))) + invariant(forall(k2, 0, MLDSA_K, array_abs_bound(t0hat->vec.vec[k2].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + invariant(forall(k3, 0, MLDSA_L, array_abs_bound(s1hat->vec.vec[k3].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + invariant(forall(k4, 0, MLDSA_K, array_abs_bound(s2hat->vec.vec[k4].coeffs, 0, MLDSA_N, MLD_NTT_BOUND))) + ) + decreases(nonce_limit - nonce) ) { - /* Reference: this code explicitly checks for exhaustion of nonce */ - /* values to provide predictable termination and results in that case */ - /* Checking here also means that incrementing nonce below can also */ - /* be proven to be type-safe. */ - if (nonce == MLD_NONCE_UB) + /* Reference: this code explicitly checks for exhaustion of signing */ + /* attempts to provide predictable termination and results in that */ + /* case. Checking here also means that incrementing nonce below can */ + /* be proven to be type-safe. */ + if (nonce == nonce_limit) { - /* Note that ret == MLD_ERR_FAIL by default, so we - * don't need to set it here. 
*/ + ret = MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED; break; } - ret = mld_attempt_signature_generation(sig, mu, rhoprime, nonce, mat, s1, - s2, t0, context); + ret = mld_attempt_signature_generation(sig, mu, rhoprime, nonce, mat, s1hat, + s2hat, t0hat, context); nonce++; if (ret == 0) { @@ -842,15 +942,16 @@ int mld_sign_signature_internal(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, } /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ - MLD_FREE(s2, mld_polyveck, 1, context); - MLD_FREE(t0, mld_polyveck, 1, context); - MLD_FREE(s1, mld_polyvecl, 1, context); + MLD_FREE(s2hat, mld_sk_s2hat, 1, context); + MLD_FREE(t0hat, mld_sk_t0hat, 1, context); + MLD_FREE(s1hat, mld_sk_s1hat, 1, context); MLD_FREE(mat, mld_polymat, 1, context); MLD_FREE(seedbuf, uint8_t, 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + 2 * MLDSA_CRHBYTES, context); return ret; } +#if !defined(MLD_CONFIG_CORE_API_ONLY) #if !defined(MLD_CONFIG_NO_RANDOMIZED_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API @@ -887,7 +988,7 @@ int mld_sign_signature(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, ret = MLD_ERR_RNG_FAIL; goto cleanup; } - MLD_CT_TESTING_SECRET(rnd, sizeof(rnd)); + MLD_CT_TESTING_SECRET(rnd, MLDSA_RNDBYTES); ret = mld_sign_signature_internal(sig, siglen, m, mlen, pre, pre_len, rnd, sk, 0, context); @@ -921,8 +1022,15 @@ int mld_sign_signature_extmu(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) { - MLD_ALIGN uint8_t rnd[MLDSA_RNDBYTES]; int ret; + MLD_ALLOC(rnd, uint8_t, MLDSA_RNDBYTES, context); + + if (rnd == NULL) + { + *siglen = 0; + ret = MLD_ERR_OUT_OF_MEMORY; + goto cleanup; + } /* Randomized variant of ML-DSA. If you need the deterministic variant, * call mld_sign_signature_internal directly with all-zero rnd. 
*/ @@ -932,14 +1040,14 @@ int mld_sign_signature_extmu(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, ret = MLD_ERR_RNG_FAIL; goto cleanup; } - MLD_CT_TESTING_SECRET(rnd, sizeof(rnd)); + MLD_CT_TESTING_SECRET(rnd, MLDSA_RNDBYTES); ret = mld_sign_signature_internal(sig, siglen, mu, MLDSA_CRHBYTES, NULL, 0, rnd, sk, 1, context); cleanup: /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ - mld_zeroize(rnd, sizeof(rnd)); + MLD_FREE(rnd, uint8_t, MLDSA_RNDBYTES, context); return ret; } @@ -960,17 +1068,24 @@ int mld_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, __loop__( assigns(i, object_whole(sm)) invariant(i <= mlen) + decreases(mlen - i) ) { sm[MLDSA_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; } ret = mld_sign_signature(sm, smlen, sm + MLDSA_CRYPTO_BYTES, mlen, ctx, ctxlen, sk, context); - *smlen += mlen; + if (ret == 0) + { + *smlen += mlen; + } return ret; } #endif /* !MLD_CONFIG_NO_RANDOMIZED_API */ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, @@ -981,39 +1096,24 @@ int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, MLD_CONFIG_CONTEXT_PARAMETER_TYPE context) { int ret, cmp; - - /* TODO: Remove the following workaround for - * https://github.com/diffblue/cbmc/issues/8813 */ - typedef MLK_UNION_OR_STRUCT - { - mld_polyveck t1; - mld_polyveck w1; - } - t1w1_u; - mld_polyveck *t1; - mld_polyveck *w1; + unsigned int i; MLD_ALLOC(buf, uint8_t, (MLDSA_K * MLDSA_POLYW1_PACKEDBYTES), context); - MLD_ALLOC(rho, uint8_t, MLDSA_SEEDBYTES, context); MLD_ALLOC(mu, uint8_t, MLDSA_CRHBYTES, context); MLD_ALLOC(c, uint8_t, MLDSA_CTILDEBYTES, context); MLD_ALLOC(c2, uint8_t, MLDSA_CTILDEBYTES, context); + MLD_ALLOC(z, mld_polyvecl, 1, context); MLD_ALLOC(cp, mld_poly, 1, context); MLD_ALLOC(mat, mld_polymat, 1, context); - MLD_ALLOC(z, 
mld_polyvecl, 1, context); - MLD_ALLOC(t1w1, t1w1_u, 1, context); - MLD_ALLOC(tmp, mld_polyveck, 1, context); - MLD_ALLOC(h, mld_polyveck, 1, context); + MLD_ALLOC(w1, mld_poly, 1, context); + MLD_ALLOC(tmp, mld_poly, 1, context); - if (buf == NULL || rho == NULL || mu == NULL || c == NULL || c2 == NULL || - cp == NULL || mat == NULL || z == NULL || t1w1 == NULL || tmp == NULL || - h == NULL) + if (buf == NULL || mu == NULL || c == NULL || c2 == NULL || z == NULL || + cp == NULL || mat == NULL || w1 == NULL || tmp == NULL) { ret = MLD_ERR_OUT_OF_MEMORY; goto cleanup; } - t1 = &t1w1->t1; - w1 = &t1w1->w1; if (siglen != MLDSA_CRYPTO_BYTES) { @@ -1021,16 +1121,11 @@ int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, goto cleanup; } - mld_unpack_pk(rho, t1, pk); + mld_memcpy(c, sig, MLDSA_CTILDEBYTES); + mld_polyvecl_unpack_z(z, sig + MLDSA_CTILDEBYTES); - /* mld_unpack_sig and mld_polyvecl_chknorm signal failure through a - * single non-zero error code that's not yet aligned with MLD_ERR_XXX. - * Map it to MLD_ERR_FAIL explicitly. */ - if (mld_unpack_sig(c, z, h, sig)) - { - ret = MLD_ERR_FAIL; - goto cleanup; - } + /* mld_polyvecl_chknorm signals failure through a single non-zero error code + * that's not yet aligned with MLD_ERR_XXX. Map it to MLD_ERR_FAIL. */ if (mld_polyvecl_chknorm(z, MLDSA_GAMMA1 - MLDSA_BETA)) { ret = MLD_ERR_FAIL; @@ -1054,24 +1149,51 @@ int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, mld_memcpy(mu, m, MLDSA_CRHBYTES); } - /* Matrix-vector multiplication; compute Az - c2^dt1 */ + /* Matrix-vector multiplication and per-row reconstruction of w1. 
*/ + mld_polyvecl_ntt(z); + mld_polyvec_matrix_expand(mat, pk); mld_poly_challenge(cp, c); mld_poly_ntt(cp); - mld_polyveck_shiftl(t1); - mld_polyveck_ntt(t1); - mld_polyveck_pointwise_poly_montgomery(tmp, cp, t1); - mld_polyvec_matrix_expand(mat, rho); - mld_polyvecl_ntt(z); - mld_polyvec_matrix_pointwise_montgomery(w1, mat, z); - mld_polyveck_sub(w1, tmp); - mld_polyveck_reduce(w1); - mld_polyveck_invntt_tomont(w1); - - /* Reconstruct w1 */ - mld_polyveck_caddq(w1); - mld_polyveck_use_hint(tmp, w1, h); - mld_polyveck_pack_w1(buf, tmp); + for (i = 0; i < MLDSA_K; ++i) + __loop__( + assigns(MLD_IF_REDUCE_RAM(memory_slice(mat, sizeof(mld_polymat)),) + i, ret, + memory_slice(w1, sizeof(mld_poly)), + memory_slice(tmp, sizeof(mld_poly)), + memory_slice(buf, MLDSA_K * MLDSA_POLYW1_PACKEDBYTES) + ) + invariant(i <= MLDSA_K) + decreases(MLDSA_K - i) + ) + { + /* w1 = (A * z)_i in NTT domain */ + mld_polyvec_matrix_pointwise_montgomery_row(w1, mat, z, i); + + /* tmp = c * t1_i * 2^d in NTT domain */ + mld_unpack_pk_t1(tmp, pk, i); + mld_poly_shiftl(tmp); + mld_poly_ntt(tmp); + mld_poly_pointwise_montgomery(tmp, cp); + + /* w1 = invNTT(w1 - c * t1_i * 2^d) */ + mld_poly_sub(w1, tmp); + mld_poly_reduce(w1); + mld_poly_invntt_tomont(w1); + mld_poly_caddq(w1); + + /* tmp = h_i (decoded and validated from signature) */ + ret = mld_sig_unpack_hints(tmp, sig, i); + if (ret != 0) + { + goto cleanup; + } + + /* w1 = use_hint(w1, tmp), then pack into buf[i] */ + mld_poly_use_hint(w1, tmp); + mld_polyw1_pack(buf + i * MLDSA_POLYW1_PACKEDBYTES, w1); + } + /* Call random oracle and verify challenge */ mld_H(c2, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, buf, MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0); @@ -1085,20 +1207,19 @@ int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, cleanup: /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. 
*/ - MLD_FREE(h, mld_polyveck, 1, context); - MLD_FREE(tmp, mld_polyveck, 1, context); - MLD_FREE(t1w1, t1w1_u, 1, context); - MLD_FREE(z, mld_polyvecl, 1, context); + MLD_FREE(tmp, mld_poly, 1, context); + MLD_FREE(w1, mld_poly, 1, context); MLD_FREE(mat, mld_polymat, 1, context); MLD_FREE(cp, mld_poly, 1, context); + MLD_FREE(z, mld_polyvecl, 1, context); MLD_FREE(c2, uint8_t, MLDSA_CTILDEBYTES, context); MLD_FREE(c, uint8_t, MLDSA_CTILDEBYTES, context); MLD_FREE(mu, uint8_t, MLDSA_CRHBYTES, context); - MLD_FREE(rho, uint8_t, MLDSA_SEEDBYTES, context); MLD_FREE(buf, uint8_t, (MLDSA_K * MLDSA_POLYW1_PACKEDBYTES), context); return ret; } +#if !defined(MLD_CONFIG_CORE_API_ONLY) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, @@ -1165,6 +1286,7 @@ int mld_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, __loop__( assigns(i, memory_slice(m, *mlen)) invariant(i <= *mlen) + decreases(*mlen - i) ) { m[i] = sm[MLDSA_CRYPTO_BYTES + i]; @@ -1182,8 +1304,11 @@ int mld_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, return ret; } +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_VERIFY_API */ - +#if !defined(MLD_CONFIG_CORE_API_ONLY) +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature_pre_hash_internal( @@ -1224,7 +1349,9 @@ int mld_sign_signature_pre_hash_internal( mld_zeroize(pre, sizeof(pre)); return ret; } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_pre_hash_internal( @@ -1253,7 +1380,9 @@ int mld_sign_verify_pre_hash_internal( mld_zeroize(pre, sizeof(pre)); return ret; } +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature_pre_hash_shake256( @@ -1273,7 +1402,9 @@ int mld_sign_signature_pre_hash_shake256( 
mld_zeroize(ph, sizeof(ph)); return ret; } +#endif /* !MLD_CONFIG_NO_SIGN_API */ +#if !defined(MLD_CONFIG_NO_VERIFY_API) MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_pre_hash_shake256( @@ -1292,19 +1423,17 @@ int mld_sign_verify_pre_hash_shake256( mld_zeroize(ph, sizeof(ph)); return ret; } +#endif /* !MLD_CONFIG_NO_VERIFY_API */ - +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) #define MLD_PRE_HASH_OID_LEN 11 -/************************************************* - * Name: mld_get_hash_oid - * - * Description: Returns the OID of a given SHA-2/SHA-3 hash function. +/** + * Return the OID of a given SHA-2/SHA-3 hash function. * - * Arguments: - uint8_t oid[11]: pointer to output oid - * - int hashalg: hash algorithm constant (MLD_PREHASH_*) - * - ***************************************************/ + * @param[out] oid Pointer to output OID. + * @param hashalg Hash algorithm constant (MLD_PREHASH_*). + */ static void mld_get_hash_oid(uint8_t oid[MLD_PRE_HASH_OID_LEN], int hashalg) { unsigned int i; @@ -1341,6 +1470,7 @@ static void mld_get_hash_oid(uint8_t oid[MLD_PRE_HASH_OID_LEN], int hashalg) for (i = 0; i < sizeof(oid_map) / sizeof(oid_map[0]); i++) __loop__( invariant(i <= sizeof(oid_map) / sizeof(oid_map[0])) + decreases(sizeof(oid_map) / sizeof(oid_map[0]) - i) ) { if (oid_map[i].alg == hashalg) @@ -1418,7 +1548,9 @@ size_t mld_prepare_domain_separation_prefix( mld_memcpy(prefix + 2 + ctxlen + MLD_PRE_HASH_OID_LEN, ph, phlen); return 2 + ctxlen + MLD_PRE_HASH_OID_LEN + phlen; } +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) MLD_EXTERNAL_API int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES], @@ -1432,36 +1564,53 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], MLD_ALLOC(key, uint8_t, MLDSA_SEEDBYTES, context); MLD_ALLOC(s1, mld_polyvecl, 1, context); MLD_ALLOC(s2, mld_polyveck, 
1, context); - MLD_ALLOC(t0, mld_polyveck, 1, context); - MLD_ALLOC(t0_computed, mld_polyveck, 1, context); - MLD_ALLOC(t1, mld_polyveck, 1, context); + MLD_ALLOC(t0_packed, uint8_t, MLDSA_K *MLDSA_POLYT0_PACKEDBYTES, context); if (rho == NULL || tr == NULL || tr_computed == NULL || key == NULL || - s1 == NULL || s2 == NULL || t0 == NULL || t0_computed == NULL || - t1 == NULL) + s1 == NULL || s2 == NULL || t0_packed == NULL) { ret = MLD_ERR_OUT_OF_MEMORY; goto cleanup; } - /* Unpack secret key */ - mld_unpack_sk(rho, tr, key, t0, s1, s2, sk); + /* Inline unpack_sk: mld_unpack_sk uses lazy types for s1/s2/t0 which + * we cannot use here. t0 stays in packed form -- we compare it against + * the recomputed value below. */ + mld_memcpy(rho, sk, MLDSA_SEEDBYTES); + mld_memcpy(key, sk + MLDSA_SEEDBYTES, MLDSA_SEEDBYTES); + mld_memcpy(tr, sk + 2 * MLDSA_SEEDBYTES, MLDSA_TRBYTES); + mld_polyvecl_unpack_eta(s1, sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES); + mld_polyveck_unpack_eta(s2, sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + + MLDSA_L * MLDSA_POLYETA_PACKEDBYTES); /* Validate s1 and s2 coefficients are within [-MLDSA_ETA, MLDSA_ETA] */ chk1 = mld_polyvecl_chknorm(s1, MLDSA_ETA + 1) & 0xFF; chk2 = mld_polyveck_chknorm(s2, MLDSA_ETA + 1) & 0xFF; - /* Recompute t0, t1, tr, and pk from rho, s1, s2 */ - ret = mld_compute_t0_t1_tr_from_sk_components(t0_computed, t1, tr_computed, - pk, rho, s1, s2, context); + /* NTT s1 in place to use as s1hat */ + mld_polyvecl_ntt(s1); + + /* Pack rho into pk */ + mld_memcpy(pk, rho, MLDSA_SEEDBYTES); + + /* Recompute t row by row, decompose, and pack t1 into pk and t0 into + * t0_packed. */ + ret = mld_compute_pack_t0_t1(pk + MLDSA_SEEDBYTES, t0_packed, s1, s2, rho, + context); if (ret != 0) { goto cleanup; } - /* Validate t0 and tr using constant-time comparisons */ - cmp0 = mld_ct_memcmp((const uint8_t *)t0, (const uint8_t *)t0_computed, - sizeof(mld_polyveck)); + /* Compare recomputed packed t0 against the t0 region of sk. 
*/ + cmp0 = mld_ct_memcmp(t0_packed, + sk + 2 * MLDSA_SEEDBYTES + MLDSA_TRBYTES + + MLDSA_L * MLDSA_POLYETA_PACKEDBYTES + + MLDSA_K * MLDSA_POLYETA_PACKEDBYTES, + MLDSA_K * MLDSA_POLYT0_PACKEDBYTES); + + /* Compute tr_computed = H(pk) and compare to the stored tr */ + mld_shake256(tr_computed, MLDSA_TRBYTES, pk, MLDSA_CRYPTO_PUBLICKEYBYTES); cmp1 = mld_ct_memcmp((const uint8_t *)tr, (const uint8_t *)tr_computed, MLDSA_TRBYTES); check = mld_value_barrier_u8(cmp0 | cmp1 | chk1 | chk2); @@ -1481,9 +1630,7 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], MLD_CT_TESTING_DECLASSIFY(pk, MLDSA_CRYPTO_PUBLICKEYBYTES); /* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */ - MLD_FREE(t1, mld_polyveck, 1, context); - MLD_FREE(t0_computed, mld_polyveck, 1, context); - MLD_FREE(t0, mld_polyveck, 1, context); + MLD_FREE(t0_packed, uint8_t, MLDSA_K *MLDSA_POLYT0_PACKEDBYTES, context); MLD_FREE(s2, mld_polyveck, 1, context); MLD_FREE(s1, mld_polyvecl, 1, context); MLD_FREE(key, uint8_t, MLDSA_SEEDBYTES, context); @@ -1493,6 +1640,8 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], return ret; } +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ @@ -1503,6 +1652,8 @@ int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], #undef mld_H #undef mld_compute_pack_z #undef mld_attempt_signature_generation -#undef mld_compute_t0_t1_tr_from_sk_components +#undef mld_compute_pack_t0_t1 +#undef mld_get_max_signing_attempts #undef MLD_NONCE_UB +#undef MLD_CONFIG_MAX_SIGNING_ATTEMPTS #undef MLD_PRE_HASH_OID_LEN diff --git a/crypto/fipsmodule/ml_dsa/mldsa/sign.h b/crypto/fipsmodule/ml_dsa/mldsa/sign.h index 2e0bac32ead..6a2643abb1e 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/sign.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/sign.h @@ -16,7 +16,6 @@ #define MLD_SIGN_H #include -#include #include "cbmc.h" #include "common.h" #include "poly.h" @@ -72,9 +71,7 @@ #define mld_sign_pk_from_sk \ MLD_NAMESPACE_KL(pk_from_sk) MLD_CONTEXT_PARAMETERS_2 -/************************************************* - * Hash algorithm constants for domain separation - **************************************************/ +/* Hash algorithm constants for domain separation */ #define MLD_PREHASH_NONE 0 #define MLD_PREHASH_SHA2_224 1 #define MLD_PREHASH_SHA2_256 2 @@ -89,27 +86,35 @@ #define MLD_PREHASH_SHAKE_128 11 #define MLD_PREHASH_SHAKE_256 12 -/************************************************* - * Name: mld_sign_keypair_internal - * - * Description: Generates public and private key. Internal API. - * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise - * Consistency Test (PCT) as required by FIPS 140-3 IG. - * - * Arguments: - uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: output public key - * - uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: output private key - * - const uint8_t seed[MLDSA_SEEDBYTES]: input random seed - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure, incl. PCT failure - * if MLD_CONFIG_KEYGEN_PCT is enabled. 
- * - * Specification: Implements @[FIPS204 Algorithm 6 (ML-DSA.KeyGen_internal)] - * - **************************************************/ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +/** + * Generate a public-private key pair from a seed. + * + * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise Consistency Test + * (PCT) as required by FIPS 140-3 IG. + * + * @spec{Implements @[FIPS204 Algorithm 6 (ML-DSA.KeyGen_internal)].} + * + * @param[out] pk Output public key. + * @param[out] sk Output private key. + * @param[in] seed Input random seed. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The PCT's signing step exhausted + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. Only possible when + * MLD_CONFIG_KEYGEN_PCT is enabled. + * @retval MLD_ERR_FAIL Other kinds of failure, including + * PCT failure if + * MLD_CONFIG_KEYGEN_PCT is enabled. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_keypair_internal(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], @@ -123,29 +128,38 @@ __contract__( assigns(object_whole(pk)) assigns(object_whole(sk)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || - return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL) + return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL || + return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) ); -/************************************************* - * Name: mld_sign_keypair - * - * Description: Generates public and private key. - * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise - * Consistency Test (PCT) as required by FIPS 140-3 IG. 
- * - * Arguments: - uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: output public key - * - uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: output private key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure, incl. PCT failure - * if MLD_CONFIG_KEYGEN_PCT is enabled. - * - * Specification: Implements @[FIPS204 Algorithm 1 (ML-DSA.KeyGen)] - * - **************************************************/ +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Generate a public-private key pair. + * + * When MLD_CONFIG_KEYGEN_PCT is set, performs a Pairwise Consistency Test + * (PCT) as required by FIPS 140-3 IG. + * + * @spec{Implements @[FIPS204 Algorithm 1 (ML-DSA.KeyGen)].} + * + * @param[out] pk Output public key. + * @param[out] sk Output private key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The PCT's signing step exhausted + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. Only possible when + * MLD_CONFIG_KEYGEN_PCT is enabled. + * @retval MLD_ERR_FAIL Other kinds of failure, including + * PCT failure if + * MLD_CONFIG_KEYGEN_PCT is enabled. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_keypair(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], @@ -157,40 +171,49 @@ __contract__( assigns(object_whole(pk)) assigns(object_whole(sk)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || - return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL) + return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL || + return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) ); +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ -/************************************************* - * Name: mld_sign_signature_internal - * - * Description: Computes signature. Internal API. - * - * Arguments: - uint8_t sig[MLDSA_CRYPTO_BYTES]: output signature - * - size_t *siglen: pointer to output length of - * signature - * - const uint8_t *m: pointer to message to be signed - * - size_t mlen: length of message - * - const uint8_t *pre: pointer to prefix string - * - size_t prelen: length of prefix string - * - const uint8_t rnd[MLDSA_RNDBYTES]: - * random seed - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: - * bit-packed secret key - * - int externalmu: indicates input message m is - * processed as mu - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure - * - * If the returned value is non-zero, then the values of *sig and - * *siglen should not be referenced. - * - * Reference: This code differs from the reference implementation - * in that it adds an explicit check for nonce exhaustion - * and can return MLD_ERR_FAIL in that case. - **************************************************/ +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * Compute signature using internal randomness. + * + * If the returned value is non-zero, then the values of *sig and *siglen + * should not be referenced. 
+ * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] m Pointer to message to be signed (when + * externalmu == 0), or to a precomputed + * message representative mu (when externalmu != 0). + * @param mlen Length of m. Must equal MLDSA_CRHBYTES when + * externalmu != 0. + * @param[in] pre Pointer to prefix string. Ignored when + * externalmu != 0. + * @param prelen Length of prefix string. Ignored when + * externalmu != 0. + * @param[in] rnd Random seed. + * @param[in] sk Bit-packed secret key. + * @param externalmu 0: m/mlen is the raw message; mu = H(tr, pre, m) is + * computed internally. + * non-zero: m points to a precomputed mu of + * MLDSA_CRHBYTES bytes; pre/prelen unused. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature_internal(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, @@ -208,44 +231,46 @@ __contract__( requires(memory_no_alias(m, mlen)) requires(memory_no_alias(rnd, MLDSA_RNDBYTES)) requires(memory_no_alias(sk, MLDSA_CRYPTO_SECRETKEYBYTES)) - requires((externalmu == 0 && (prelen == 0 || memory_no_alias(pre, prelen))) || - (externalmu == 1 && mlen == MLDSA_CRHBYTES)) + requires((externalmu == 0) ==> ((prelen == 0) || memory_no_alias(pre, prelen))) + requires((externalmu != 0) ==> (mlen == MLDSA_CRHBYTES)) assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) assigns(object_whole(siglen)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || - return_value == MLD_ERR_OUT_OF_MEMORY) + return_value == MLD_ERR_OUT_OF_MEMORY || + return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) ensures(return_value == 0 ==> *siglen == MLDSA_CRYPTO_BYTES) ensures(return_value != 0 ==> *siglen == 0) ); -/************************************************* - * Name: mld_sign_signature - * - * Description: Computes signature. This function implements the randomized - * variant of ML-DSA. If you require the deterministic variant, - * use mld_sign_signature_internal directly. - * - * Arguments: - uint8_t sig[MLDSA_CRYPTO_BYTES]: output signature - * - size_t *siglen: pointer to output length of - * signature - * - const uint8_t *m: pointer to message to be signed - * - size_t mlen: length of message - * - uint8_t *ctx: pointer to context string. - * May be NULL if ctxlen == 0. - * - size_t ctxlen: length of context string. - * Should be <= 255. - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: - * bit-packed secret key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure. 
- * - * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign)]. - * - **************************************************/ +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Compute signature. This function implements the randomized variant of + * ML-DSA. If you require the deterministic variant, use + * mld_sign_signature_internal directly. + * + * @spec{Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign)].} + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] m Pointer to message to be signed. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. May be NULL if ctxlen == 0. + * @param ctxlen Length of context string. Should be <= 255. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, @@ -264,34 +289,37 @@ __contract__( assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) assigns(object_whole(siglen)) ensures((return_value == 0 && *siglen == MLDSA_CRYPTO_BYTES) || - ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL) && *siglen == 0)) + ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL || return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) && *siglen == 0)) ); -/************************************************* - * Name: mld_sign_signature_extmu - * - * Description: Computes signature. This function implements the randomized - * variant of ML-DSA. If you require the deterministic variant, - * use mld_sign_signature_internal directly. - * - * Arguments: - uint8_t sig[MLDSA_CRYPTO_BYTES]: output signature - * - size_t *siglen: pointer to output length of - * signature - * - const uint8_t mu[MLDSA_CRHBYTES]: - * input mu to be signed - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: - * bit-packed secret key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_RNG_FAIL: Random number generation failed. - * - MLD_ERR_FAIL: Other kinds of failure. - * - * Specification: Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign external mu - * variant)] - * - **************************************************/ +/** + * Compute signature in "external mu" mode: the caller has already computed + * the message representative mu = SHAKE256(tr || M', 64), where + * tr = SHAKE256(pk, 64) and M' is the FIPS 204 formatted message (e.g. + * 0x00 || ctxlen || ctx || msg for pure ML-DSA). 
This is the randomized + * variant; for the deterministic variant, use mld_sign_signature_internal + * directly with externalmu set to non-zero and an all-zero rnd. + * + * @spec{Implements @[FIPS204 Algorithm 2 (ML-DSA.Sign external mu variant)].} + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] mu Precomputed message representative. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_RNG_FAIL Random number generation failed. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature_extmu(uint8_t sig[MLDSA_CRYPTO_BYTES], size_t *siglen, @@ -306,31 +334,33 @@ __contract__( assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) assigns(object_whole(siglen)) ensures((return_value == 0 && *siglen == MLDSA_CRYPTO_BYTES) || - ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL) && *siglen == 0)) + ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_RNG_FAIL || return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) && *siglen == 0)) ); -/************************************************* - * Name: mld_sign - * - * Description: Compute signed message. 
- * - * Arguments: - uint8_t *sm: pointer to output signed message - * (allocated array with MLDSA_CRYPTO_BYTES + - *mlen bytes), can be equal to m - * - size_t *smlen: pointer to output length of signed message - * - const uint8_t *m: pointer to message to be signed - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: - * bit-packed secret key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure - * - **************************************************/ +/** + * Compute signed message. + * + * @param[out] sm Pointer to output signed message (allocated array with + * MLDSA_CRYPTO_BYTES + mlen bytes); can be equal to m. + * @param[out] smlen Pointer to output length of signed message. + * @param[in] m Pointer to message to be signed. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, @@ -348,35 +378,43 @@ __contract__( assigns(memory_slice(sm, MLDSA_CRYPTO_BYTES + mlen)) assigns(object_whole(smlen)) ensures((return_value == 0 && *smlen == MLDSA_CRYPTO_BYTES + mlen) || - (return_value == MLD_ERR_FAIL - || return_value == MLD_ERR_OUT_OF_MEMORY - || return_value == MLD_ERR_RNG_FAIL)) + ((return_value == MLD_ERR_FAIL + || return_value == MLD_ERR_OUT_OF_MEMORY + || return_value == MLD_ERR_RNG_FAIL + || return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) && *smlen == 0)) ); +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_SIGN_API */ -/************************************************* - * Name: mld_sign_verify_internal - * - * Description: Verifies signature. Internal API. - * - * Arguments: - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *m: pointer to message - * - size_t mlen: length of message - * - const uint8_t *pre: pointer to prefix string - * - size_t prelen: length of prefix string - * - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: - * bit-packed public key - * - int externalmu: indicates input message m is processed as - * mu - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - * Specification: Implements @[FIPS204 Algorithm 8 (ML-DSA.Verify_internal)] - * - **************************************************/ +#if !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * Verify signature. + * + * @spec{Implements @[FIPS204 Algorithm 8 (ML-DSA.Verify_internal)].} + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] m Pointer to message (when externalmu == 0), or to a + * precomputed message representative mu (when + * externalmu != 0). 
+ * @param mlen Length of m. Must equal MLDSA_CRHBYTES when + * externalmu != 0. + * @param[in] pre Pointer to prefix string. Ignored when externalmu != 0. + * @param prelen Length of prefix string. Ignored when externalmu != 0. + * @param[in] pk Bit-packed public key. + * @param externalmu 0: m/mlen is the raw message; mu = H(H(pk), pre, m) is + * computed internally. + * non-zero: m points to a precomputed mu of + * MLDSA_CRHBYTES bytes; pre/prelen unused. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_internal(const uint8_t *sig, size_t siglen, @@ -391,35 +429,34 @@ __contract__( requires(siglen <= MLD_MAX_BUFFER_SIZE) requires(memory_no_alias(sig, siglen)) requires(memory_no_alias(m, mlen)) - requires(externalmu == 0 || (externalmu == 1 && mlen == MLDSA_CRHBYTES)) - requires(externalmu == 1 || prelen == 0 || memory_no_alias(pre, prelen)) + requires((externalmu == 0) ==> ((prelen == 0) || memory_no_alias(pre, prelen))) + requires((externalmu != 0) ==> (mlen == MLDSA_CRHBYTES)) requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); -/************************************************* - * Name: mld_sign_verify - * - * Description: Verifies signature. - * - * Arguments: - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *m: pointer to message - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string. - * May be NULL if ctxlen == 0. 
- * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: - * bit-packed public key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify)] - * - **************************************************/ +#if !defined(MLD_CONFIG_CORE_API_ONLY) +/** + * Verify signature. + * + * @spec{Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify)].} + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] m Pointer to message. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. May be NULL if ctxlen == 0. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, @@ -437,27 +474,28 @@ __contract__( ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); -/************************************************* - * Name: mld_sign_verify_extmu - * - * Description: Verifies signature. 
- * - * Arguments: - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t mu[MLDSA_CRHBYTES]: - * input mu - * - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: - * bit-packed public key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - * Specification: Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify external mu - * variant)] - * - **************************************************/ +/** + * Verify signature in "external mu" mode: the caller has already computed + * the message representative mu = SHAKE256(tr || M', 64), where + * tr = SHAKE256(pk, 64) and M' is the FIPS 204 formatted message (e.g. + * 0x00 || ctxlen || ctx || msg for pure ML-DSA). The same mu must have been + * used at signing time. + * + * @spec{Implements @[FIPS204 Algorithm 3 (ML-DSA.Verify external mu variant)].} + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] mu Precomputed message representative. + * @param[in] pk Bit-packed public key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_extmu(const uint8_t *sig, size_t siglen, @@ -472,27 +510,26 @@ __contract__( ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); -/************************************************* - * Name: mld_sign_open - * - * Description: Verify signed message. 
- * - * Arguments: - uint8_t *m: pointer to output message (allocated array - * with smlen bytes), can be equal to sm - * - size_t *mlen: pointer to output length of message - * - const uint8_t *sm: pointer to signed message - * - size_t smlen: length of signed message - * - const uint8_t *ctx: pointer to context tring - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: - * bit-packed public key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - **************************************************/ +/** + * Verify signed message. + * + * @param[out] m Pointer to output message (allocated array with smlen + * bytes); can be equal to sm. + * @param[out] mlen Pointer to output length of message. + * @param[in] sm Pointer to signed message. + * @param smlen Length of signed message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, @@ -511,41 +548,46 @@ __contract__( assigns(memory_slice(mlen, sizeof(size_t))) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); +#endif /* !MLD_CONFIG_CORE_API_ONLY */ +#endif /* !MLD_CONFIG_NO_VERIFY_API */ -/************************************************* - * Name: mld_sign_signature_pre_hash_internal - * - * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign. - * Computes signature with pre-hashed message. - * - * Arguments: - uint8_t sig[MLDSA_CRYPTO_BYTES]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t *ph: pointer to pre-hashed message - * - size_t phlen: length of pre-hashed message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t rnd[MLDSA_RNDBYTES]: - * random seed - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: - * bit-packed secret key - * - int hashalg: hash algorithm constant (one of - * MLD_PREHASH_*) - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure +#if !defined(MLD_CONFIG_CORE_API_ONLY) +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * FIPS 204: Algorithm 4 HashML-DSA.Sign. Compute signature with pre-hashed + * message. * * Supported hash algorithm constants: * MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384, * MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256, * MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384, - * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256 + * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256. * - * Warning: This is an unstable API that may change in the future. 
If you need + * @warning This is an unstable API that may change in the future. If you need * a stable API use mld_sign_signature_pre_hash_shake256. - **************************************************/ + * + * @param[out] sig Output signature. + * @param[out] siglen Pointer to output length of signature. + * @param[in] ph Pointer to pre-hashed message. + * @param phlen Length of pre-hashed message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] rnd Random seed. + * @param[in] sk Bit-packed secret key. + * @param hashalg Hash algorithm constant (one of MLD_PREHASH_*). + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature_pre_hash_internal( @@ -566,40 +608,41 @@ __contract__( assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) assigns(object_whole(siglen)) ensures((return_value == 0 && *siglen == MLDSA_CRYPTO_BYTES) || - ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) && *siglen == 0)) + ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) && *siglen == 0)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -/************************************************* - * Name: mld_sign_verify_pre_hash_internal - * - * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify. - * Verifies signature with pre-hashed message. 
- * - * Arguments: - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *ph: pointer to pre-hashed message - * - size_t phlen: length of pre-hashed message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: - * bit-packed public key - * - int hashalg: hash algorithm constant (one of - * MLD_PREHASH_*) - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed +#if !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * FIPS 204: Algorithm 5 HashML-DSA.Verify. Verify signature with pre-hashed + * message. * * Supported hash algorithm constants: * MLD_PREHASH_SHA2_224, MLD_PREHASH_SHA2_256, MLD_PREHASH_SHA2_384, * MLD_PREHASH_SHA2_512, MLD_PREHASH_SHA2_512_224, MLD_PREHASH_SHA2_512_256, * MLD_PREHASH_SHA3_224, MLD_PREHASH_SHA3_256, MLD_PREHASH_SHA3_384, - * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256 + * MLD_PREHASH_SHA3_512, MLD_PREHASH_SHAKE_128, MLD_PREHASH_SHAKE_256. * - * Warning: This is an unstable API that may change in the future. If you need + * @warning This is an unstable API that may change in the future. If you need * a stable API use mld_sign_verify_pre_hash_shake256. - **************************************************/ + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] ph Pointer to pre-hashed message. + * @param phlen Length of pre-hashed message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param hashalg Hash algorithm constant (one of MLD_PREHASH_*). + * @param context Application context. 
Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_pre_hash_internal( @@ -617,33 +660,36 @@ __contract__( requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ -/************************************************* - * Name: mld_sign_signature_pre_hash_shake256 - * - * Description: FIPS 204: Algorithm 4 HashML-DSA.Sign with SHAKE256. - * Computes signature with pre-hashed message using SHAKE256. - * This function computes the SHAKE256 hash of the message - *internally. - * - * Arguments: - uint8_t sig[MLDSA_CRYPTO_BYTES]: - * output signature - * - size_t *siglen: pointer to output length of signature - * - const uint8_t *m: pointer to message to be hashed and signed - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t rnd[MLDSA_RNDBYTES]: - * random seed - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: - * bit-packed secret key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Other kinds of failure - * - **************************************************/ +#if !defined(MLD_CONFIG_NO_SIGN_API) +/** + * FIPS 204: Algorithm 4 HashML-DSA.Sign with SHAKE256. + * + * Compute signature with pre-hashed message using SHAKE256. This function + * computes the SHAKE256 hash of the message internally. + * + * @param[out] sig Output signature. 
+ * @param[out] siglen Pointer to output length of signature. + * @param[in] m Pointer to message to be hashed and signed. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] rnd Random seed. + * @param[in] sk Bit-packed secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was + * used and an allocation via + * MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED The rejection-sampling loop exceeded + * MLD_CONFIG_MAX_SIGNING_ATTEMPTS + * iterations. + * @retval MLD_ERR_FAIL Other kinds of failure. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_signature_pre_hash_shake256( @@ -664,33 +710,33 @@ __contract__( assigns(memory_slice(sig, MLDSA_CRYPTO_BYTES)) assigns(object_whole(siglen)) ensures((return_value == 0 && *siglen == MLDSA_CRYPTO_BYTES) || - ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) && *siglen == 0)) + ((return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY || return_value == MLD_ERR_SIGN_ATTEMPTS_EXHAUSTED) && *siglen == 0)) ); +#endif /* !MLD_CONFIG_NO_SIGN_API */ -/************************************************* - * Name: mld_sign_verify_pre_hash_shake256 - * - * Description: FIPS 204: Algorithm 5 HashML-DSA.Verify with SHAKE256. - * Verifies signature with pre-hashed message using SHAKE256. - * This function computes the SHAKE256 hash of the message - * internally. 
- * - * Arguments: - const uint8_t *sig: pointer to input signature - * - size_t siglen: length of signature - * - const uint8_t *m: pointer to message to be hashed and - * verified - * - size_t mlen: length of message - * - const uint8_t *ctx: pointer to context string - * - size_t ctxlen: length of context string - * - const uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: - * bit-packed public key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Signature verification failed - * - **************************************************/ +#if !defined(MLD_CONFIG_NO_VERIFY_API) +/** + * FIPS 204: Algorithm 5 HashML-DSA.Verify with SHAKE256. + * + * Verify signature with pre-hashed message using SHAKE256. This function + * computes the SHAKE256 hash of the message internally. + * + * @param[in] sig Pointer to input signature. + * @param siglen Length of signature. + * @param[in] m Pointer to message to be hashed and verified. + * @param mlen Length of message. + * @param[in] ctx Pointer to context string. + * @param ctxlen Length of context string. + * @param[in] pk Bit-packed public key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Signature verification failed. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_verify_pre_hash_shake256( @@ -708,50 +754,47 @@ __contract__( requires(memory_no_alias(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); +#endif /* !MLD_CONFIG_NO_VERIFY_API */ +#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) /* Maximum formatted domain separation message length: * - Pure ML-DSA: 0x00 || ctxlen || ctx (max 255) * - HashML-DSA: 0x01 || ctxlen || ctx (max 255) || oid (11) || ph (max 64) */ #define MLD_DOMAIN_SEPARATION_MAX_BYTES (2 + 255 + 11 + 64) -/************************************************* - * Name: mld_prepare_domain_separation_prefix - * - * Description: Prepares domain separation prefix for ML-DSA signing. - * For pure ML-DSA (hashalg == MLD_PREHASH_NONE): - * Format: 0x00 || ctxlen (1 byte) || ctx - * For HashML-DSA (hashalg != MLD_PREHASH_NONE): - * Format: 0x01 || ctxlen (1 byte) || ctx || oid (11 bytes) || ph - * - * Arguments: - uint8_t prefix[MLD_DOMAIN_SEPARATION_MAX_BYTES]: - * output domain separation prefix buffer - * - const uint8_t *ph: pointer to pre-hashed message - * (ignored for pure ML-DSA) - * - size_t phlen: length of pre-hashed message - * (ignored for pure ML-DSA) - * - const uint8_t *ctx: pointer to context string (may be NULL) - * - size_t ctxlen: length of context string - * - int hashalg: hash algorithm constant - * (MLD_PREHASH_NONE for pure ML-DSA, or MLD_PREHASH_* for - * HashML-DSA) - * - * Returns the total length of the formatted prefix, or 0 on error. +/** + * Prepare domain separation prefix for ML-DSA signing. + * + * For pure ML-DSA (hashalg == MLD_PREHASH_NONE): + * Format: 0x00 || ctxlen (1 byte) || ctx. + * + * For HashML-DSA (hashalg != MLD_PREHASH_NONE): + * Format: 0x01 || ctxlen (1 byte) || ctx || oid (11 bytes) || ph. * * This function is useful for building incremental signing APIs. 
* - * Specification: - * - For HashML-DSA (hashalg != MLD_PREHASH_NONE), implements - * @[FIPS204, Algorithm 4, L23] - * - For Pure ML-DSA (hashalg == MLD_PREHASH_NONE), implements - * ``` - * M' <- BytesToBits(IntegerToBytes(0, 1) - * || IntegerToBytes(|ctx|, 1) - * || ctx - * ``` - * which is part of @[FIPS204, Algorithm 2 (ML-DSA.Sign), L10] and - * @[FIPS204, Algorithm 3 (ML-DSA.Verify), L5]. - * - **************************************************/ + * @spec{For HashML-DSA (hashalg != MLD_PREHASH_NONE), implements + * @[FIPS204, Algorithm 4, L23]. For Pure ML-DSA (hashalg == MLD_PREHASH_NONE), + * implements + * ``` + * M' <- BytesToBits(IntegerToBytes(0, 1) + * || IntegerToBytes(|ctx|, 1) + * || ctx + * ``` + * which is part of @[FIPS204, Algorithm 2 (ML-DSA.Sign), L10] and + * @[FIPS204, Algorithm 3 (ML-DSA.Verify), L5].} + * + * @param[out] prefix Output domain separation prefix buffer. + * @param[in] ph Pointer to pre-hashed message (ignored for pure + * ML-DSA). + * @param phlen Length of pre-hashed message (ignored for pure ML-DSA). + * @param[in] ctx Pointer to context string (may be NULL). + * @param ctxlen Length of context string. + * @param hashalg Hash algorithm constant (MLD_PREHASH_NONE for pure + * ML-DSA, or MLD_PREHASH_* for HashML-DSA). + * + * @return The total length of the formatted prefix, or 0 on error. + */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API size_t mld_prepare_domain_separation_prefix( @@ -766,33 +809,32 @@ __contract__( assigns(memory_slice(prefix, MLD_DOMAIN_SEPARATION_MAX_BYTES)) ensures(return_value <= MLD_DOMAIN_SEPARATION_MAX_BYTES) ); +#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ -/************************************************* - * Name: mld_sign_pk_from_sk - * - * Description: Performs basic validity checks on secret key, and derives - * public key. - * - * Referring to the decoding of the secret key - * `sk=(rho, K, tr, s1, s2, t0)` - * (cf. 
[@FIPS204, Algorithm 25 skDecode]), - * the following checks are performed: - * - Check that s1 and s2 have coefficients in - * [-MLDSA_ETA, MLDSA_ETA] - * - Check that t0 and tr stored in sk match recomputed values. - * - * Arguments: - uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES]: output public key - * - const uint8_t sk[MLDSA_CRYPTO_SECRETKEYBYTES]: input secret - * key - * - * Returns: - 0: Success - * - MLD_ERR_OUT_OF_MEMORY: If MLD_CONFIG_CUSTOM_ALLOC_FREE is - * used and an allocation via MLD_CUSTOM_ALLOC returned NULL. - * - MLD_ERR_FAIL: Secret key validation failed - * - * Note: This function leaks whether the secret key is valid or invalid - * through its return value and timing. - **************************************************/ +#if !defined(MLD_CONFIG_NO_KEYPAIR_API) +/** + * Perform basic validity checks on secret key, and derive public key. + * + * Referring to the decoding of the secret key `sk=(rho, K, tr, s1, s2, t0)` + * (cf. @[FIPS204, Algorithm 25 skDecode]), the following checks are + * performed: + * - Check that s1 and s2 have coefficients in [-MLDSA_ETA, MLDSA_ETA]. + * - Check that t0 and tr stored in sk match recomputed values. + * + * @note This function leaks whether the secret key is valid or invalid + * through its return value and timing. + * + * @param[out] pk Output public key. + * @param[in] sk Input secret key. + * @param context Application context. Only present when + * MLD_CONFIG_CONTEXT_PARAMETER is defined; type set by + * MLD_CONFIG_CONTEXT_PARAMETER_TYPE. + * + * @retval 0 Success. + * @retval MLD_ERR_OUT_OF_MEMORY MLD_CONFIG_CUSTOM_ALLOC_FREE was used and an + * allocation via MLD_CUSTOM_ALLOC returned NULL. + * @retval MLD_ERR_FAIL Secret key validation failed. 
+ */ MLD_MUST_CHECK_RETURN_VALUE MLD_EXTERNAL_API int mld_sign_pk_from_sk(uint8_t pk[MLDSA_CRYPTO_PUBLICKEYBYTES], @@ -804,4 +846,7 @@ __contract__( assigns(memory_slice(pk, MLDSA_CRYPTO_PUBLICKEYBYTES)) ensures(return_value == 0 || return_value == MLD_ERR_FAIL || return_value == MLD_ERR_OUT_OF_MEMORY) ); +#endif /* !MLD_CONFIG_NO_KEYPAIR_API */ +#endif /* !MLD_CONFIG_CORE_API_ONLY */ + #endif /* !MLD_SIGN_H */ diff --git a/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h b/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h index bb70f05552f..4bbd4f19c07 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/symmetric.h @@ -5,7 +5,6 @@ #ifndef MLD_SYMMETRIC_H #define MLD_SYMMETRIC_H -#include #include "cbmc.h" #include "common.h" diff --git a/crypto/fipsmodule/ml_dsa/mldsa/sys.h b/crypto/fipsmodule/ml_dsa/mldsa/sys.h index 12265264201..27834690a97 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/sys.h +++ b/crypto/fipsmodule/ml_dsa/mldsa/sys.h @@ -44,6 +44,11 @@ #define MLD_SYS_AARCH64_EB #endif +/* Check if we're running on an Armv8.1-M system with MVE */ +#if defined(__ARM_ARCH_8_1M_MAIN__) || defined(__ARM_FEATURE_MVE) +#define MLD_SYS_ARMV81M_MVE +#endif + #if defined(__x86_64__) #define MLD_SYS_X86_64 #if defined(__AVX2__) @@ -59,6 +64,11 @@ #define MLD_SYS_RISCV64 #endif +#if defined(MLD_SYS_RISCV64) && defined(__riscv_vector) && \ + defined(__riscv_v_intrinsic) +#define MLD_SYS_RISCV64_RVV +#endif + #if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32 #define MLD_SYS_RISCV32 #endif @@ -237,6 +247,7 @@ typedef enum #if !defined(MLD_CONFIG_CUSTOM_CAPABILITY_FUNC) #include "cbmc.h" +MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_sys_check_capability(mld_sys_cap cap) __contract__( ensures(return_value == 0 || return_value == 1) diff --git a/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc b/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc index dc3cf15dd32..fa303444064 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc +++ 
b/crypto/fipsmodule/ml_dsa/mldsa/zetas.inc @@ -9,7 +9,6 @@ * Do not modify it directly. */ -#include /* * Table of zeta values used in the reference NTT and inverse NTT. From 3f72ba07c9692c5d82ef029af7dcb25b93f3ea94 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 22 May 2026 17:39:17 +0000 Subject: [PATCH 3/4] ML-DSA: import poly_caddq AVX2 assembly from mldsa-native Upstream mldsa-native now provides poly_caddq as verified assembly (poly_caddq_avx2_asm.S) rather than C intrinsics. Import it alongside the other proven assembly operations, add the MLD_USE_NATIVE_POLY_CADDQ declaration to our custom meta header, and drop the sed that stripped the old C-intrinsic include from the BCM. --- crypto/fipsmodule/ml_dsa/importer.sh | 13 ++- .../native/x86_64/src/poly_caddq_avx2_asm.S | 79 +++++++++++++++++++ crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h | 20 ++++- 3 files changed, 101 insertions(+), 11 deletions(-) create mode 100644 crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/poly_caddq_avx2_asm.S diff --git a/crypto/fipsmodule/ml_dsa/importer.sh b/crypto/fipsmodule/ml_dsa/importer.sh index ecb3dc9765b..2065963cd3d 100755 --- a/crypto/fipsmodule/ml_dsa/importer.sh +++ b/crypto/fipsmodule/ml_dsa/importer.sh @@ -79,9 +79,9 @@ find $TMP/mldsa/src -maxdepth 1 -type f -exec cp {} $SRC \; # Copy x86_64 backend # We import only the assembly-backed operations (NTT, INTT, nttunpack, -# pointwise, polyvecl_pointwise_acc). The AVX2 C-intrinsic operations -# (rej_uniform, decompose, use_hint, chknorm, caddq, polyz_unpack) are -# intentionally excluded. +# pointwise, polyvecl_pointwise_acc, caddq). The AVX2 C-intrinsic operations +# (rej_uniform, decompose, use_hint, chknorm, polyz_unpack) are intentionally +# excluded. # # The upstream meta.h advertises both assembly and C-intrinsic operations. 
# Rather than modify it, we keep a hand-maintained replacement in @@ -95,9 +95,8 @@ cp $TMP/mldsa/src/native/x86_64/src/arith_native_x86_64.h $SRC/native/x86_64/src # Shared constants (zetas table); needed by the assembly kernels cp $TMP/mldsa/src/native/x86_64/src/consts.h $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/consts.c $SRC/native/x86_64/src -# Assembly source files for the operations we import (NTT, INTT, nttunpack, -# pointwise, polyvecl_pointwise_acc). Only files with verified proofs are -# included. +# Assembly source files for the operations we import. Only files with verified +# proofs are included. cp $TMP/mldsa/src/native/x86_64/src/ntt_avx2_asm.S $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/intt_avx2_asm.S $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S $SRC/native/x86_64/src @@ -105,6 +104,7 @@ cp $TMP/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S $SRC/native/x86_64/src +cp $TMP/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S $SRC/native/x86_64/src # We use the custom `mldsa_native_config.h`, so can remove the default one rm -f $SRC/config.h @@ -149,7 +149,6 @@ sed "${SED_I[@]}" 's/#include "src\/\([^"]*\)"/#include "\1"/' $SRC/mldsa_native # Only consts.c (shared with the assembly backend) needs to be compiled. 
echo "Strip C-intrinsic includes from mldsa_native_bcm.c" BCM=$SRC/mldsa_native_bcm.c -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_caddq_avx2\.c"/d' "$BCM" sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_chknorm_avx2\.c"/d' "$BCM" sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_decompose_32_avx2\.c"/d' "$BCM" sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_decompose_88_avx2\.c"/d' "$BCM" diff --git a/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/poly_caddq_avx2_asm.S b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/poly_caddq_avx2_asm.S new file mode 100644 index 00000000000..af3aeab2c45 --- /dev/null +++ b/crypto/fipsmodule/ml_dsa/mldsa/native/x86_64/src/poly_caddq_avx2_asm.S @@ -0,0 +1,79 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_poly_caddq_avx2_asm + * + * Description: For all coefficients of in/out polynomial add Q if + * coefficient is negative. + * + * Arguments: - int32_t *r: pointer to input/output polynomial + **************************************************/ + +#include "_internal_s2n_bignum_x86_att.h" + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/poly_caddq_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(mldsa_poly_caddq_avx2_asm) + S2N_BN_SYM_PRIVACY_DIRECTIVE(mldsa_poly_caddq_avx2_asm) +S2N_BN_SYMBOL(mldsa_poly_caddq_avx2_asm): + + .cfi_startproc + movl $0x7fe001, %edx # imm = 0x7FE001 + leaq 0x400(%rdi), %rax + vpxor %xmm2, %xmm2, %xmm2 + vmovd %edx, %xmm1 + vpbroadcastd %xmm1, %ymm1 + +Lpoly_caddq_avx2_loop: + vpcmpgtd (%rdi), %ymm2, %ymm0 + vpand %ymm1, %ymm0, %ymm0 + vpaddd (%rdi), %ymm0, %ymm0 + vmovdqa %ymm0, (%rdi) + vpcmpgtd 0x20(%rdi), %ymm2, %ymm3 + vpand %ymm1, %ymm3, %ymm3 + vpaddd 0x20(%rdi), %ymm3, %ymm3 + vmovdqa %ymm3, 0x20(%rdi) + vpcmpgtd 0x40(%rdi), %ymm2, %ymm4 + vpand %ymm1, %ymm4, %ymm4 + vpaddd 0x40(%rdi), %ymm4, %ymm4 + vmovdqa %ymm4, 0x40(%rdi) + vpcmpgtd 0x60(%rdi), %ymm2, %ymm5 + vpand %ymm1, %ymm5, %ymm5 + vpaddd 0x60(%rdi), %ymm5, %ymm5 + vmovdqa %ymm5, 0x60(%rdi) + addq $0x80, %rdi + cmpq %rdi, %rax + jne Lpoly_caddq_avx2_loop + retq + .cfi_endproc + +S2N_BN_SIZE_DIRECTIVE(mldsa_poly_caddq_avx2_asm) + + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h b/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h index 823e2850a13..3b905ff78e2 100644 --- a/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h +++ b/crypto/fipsmodule/ml_dsa/mldsa_x86_64_meta.h @@ -5,11 +5,11 @@ * Custom x86_64 backend header for the mldsa-native import. * * mldsa-native's upstream meta.h declares native implementations for both - * assembly-backed operations (NTT, INTT, pointwise multiplication) and + * assembly-backed operations (NTT, INTT, pointwise multiplication, caddq) and * AVX2 C-intrinsic operations (rej_uniform, decompose, use_hint, chknorm, - * caddq, polyz_unpack). AWS-LC only imports the assembly-backed operations, - * so we replace the upstream meta.h with this trimmed-down version that - * declares only the subset we actually provide. + * polyz_unpack). 
AWS-LC only imports the assembly-backed operations, so we + * replace the upstream meta.h with this trimmed-down version that declares + * only the subset we actually provide. * * Kept outside the imported `mldsa/` tree so that `importer.sh` does not * need to modify upstream sources. @@ -25,6 +25,7 @@ #define MLD_USE_NATIVE_NTT_CUSTOM_ORDER #define MLD_USE_NATIVE_NTT #define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POLY_CADDQ #define MLD_USE_NATIVE_POINTWISE_MONTGOMERY #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4 #define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5 @@ -64,6 +65,17 @@ static MLD_INLINE int mld_intt_native(int32_t data[MLDSA_N]) return MLD_NATIVE_FUNC_SUCCESS; } +MLD_MUST_CHECK_RETURN_VALUE +static MLD_INLINE int mld_poly_caddq_native(int32_t a[MLDSA_N]) +{ + if (!mld_sys_check_capability(MLD_SYS_CAP_AVX2)) + { + return MLD_NATIVE_FUNC_FALLBACK; + } + mld_poly_caddq_avx2_asm(a); + return MLD_NATIVE_FUNC_SUCCESS; +} + MLD_MUST_CHECK_RETURN_VALUE static MLD_INLINE int mld_poly_pointwise_montgomery_native( int32_t a[MLDSA_N], const int32_t b[MLDSA_N]) From 0de2fe72f90b3022452b64c63f2813dacb406b1a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 23 May 2026 02:20:59 +0000 Subject: [PATCH 4/4] ML-DSA: simplify importer x86_64 backend copy and BCM stripping Replace individual cp lines for each .S file with a single glob, and collapse the per-file sed deletions into one pattern that removes all x86_64 C-intrinsic .c includes except consts.c. 
--- crypto/fipsmodule/ml_dsa/importer.sh | 37 ++++++---------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/crypto/fipsmodule/ml_dsa/importer.sh b/crypto/fipsmodule/ml_dsa/importer.sh index 2065963cd3d..e5d4eb7547a 100755 --- a/crypto/fipsmodule/ml_dsa/importer.sh +++ b/crypto/fipsmodule/ml_dsa/importer.sh @@ -78,33 +78,22 @@ mkdir $SRC find $TMP/mldsa/src -maxdepth 1 -type f -exec cp {} $SRC \; # Copy x86_64 backend -# We import only the assembly-backed operations (NTT, INTT, nttunpack, -# pointwise, polyvecl_pointwise_acc, caddq). The AVX2 C-intrinsic operations -# (rej_uniform, decompose, use_hint, chknorm, polyz_unpack) are intentionally -# excluded. +# We import all assembly (.S) files and shared headers/constants from the +# upstream x86_64 backend. The AVX2 C-intrinsic .c files (rej_uniform, +# decompose, use_hint, chknorm, polyz_unpack) are excluded — their includes +# are stripped from the BCM below. # # The upstream meta.h advertises both assembly and C-intrinsic operations. # Rather than modify it, we keep a hand-maintained replacement in # ../mldsa_x86_64_meta.h (referenced via MLD_CONFIG_ARITH_BACKEND_FILE) that # declares only the assembly-backed subset. Upstream meta.h is not copied. mkdir -p $SRC/native/x86_64/src -# Backend API and specification assumed by mldsa-native frontend cp $TMP/mldsa/src/native/api.h $SRC/native -# Backend header -- unused C-intrinsic declarations are harmless and left intact cp $TMP/mldsa/src/native/x86_64/src/arith_native_x86_64.h $SRC/native/x86_64/src -# Shared constants (zetas table); needed by the assembly kernels cp $TMP/mldsa/src/native/x86_64/src/consts.h $SRC/native/x86_64/src cp $TMP/mldsa/src/native/x86_64/src/consts.c $SRC/native/x86_64/src -# Assembly source files for the operations we import. Only files with verified -# proofs are included. 
-cp $TMP/mldsa/src/native/x86_64/src/ntt_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/intt_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S $SRC/native/x86_64/src -cp $TMP/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S $SRC/native/x86_64/src +# NOTE: all imported .S files must have verified proofs in s2n-bignum. +cp $TMP/mldsa/src/native/x86_64/src/*.S $SRC/native/x86_64/src # We use the custom `mldsa_native_config.h`, so can remove the default one rm -f $SRC/config.h @@ -146,20 +135,10 @@ echo "Fixup include paths" sed "${SED_I[@]}" 's/#include "src\/\([^"]*\)"/#include "\1"/' $SRC/mldsa_native_bcm.c # Drop #include directives for the C-intrinsic .c files we did not import. -# Only consts.c (shared with the assembly backend) needs to be compiled. +# Only consts.c (shared with the assembly backend) is kept. 
echo "Strip C-intrinsic includes from mldsa_native_bcm.c" BCM=$SRC/mldsa_native_bcm.c -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_chknorm_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_decompose_32_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_decompose_88_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_use_hint_32_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/poly_use_hint_88_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/polyz_unpack_17_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/polyz_unpack_19_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_eta2_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_eta4_avx2\.c"/d' "$BCM" -sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/rej_uniform_table\.c"/d' "$BCM" +sed "${SED_I[@]}" '/^#include "native\/x86_64\/src\/[^"]*\.c"/{/consts\.c/!d;}' "$BCM" # ================================================================ # Fixup x86_64 assembly backend to use s2n-bignum macros