From 0e13d31df173faa140d9dc465717521f2bb9e2c3 Mon Sep 17 00:00:00 2001 From: willieyz Date: Wed, 4 Mar 2026 15:51:05 +0800 Subject: [PATCH 01/27] CBMC: Introduce separate proofs for Keccak XXX_c() functions This commit introduces separate proofs for: - mlk_keccakf1600_permute_c() - mlk_keccakf1600x4_extract_bytes_c() - mlk_keccakf1600x4_xor_bytes_c() For arithmetic functions that have a native implementation, we have 3 CBMC proofs: 1. Proof for the pure C implementation named XXX_c() 2. Proof for the wrapper function on top of the C implementation 3. Proof for the wrapper function on top of the native function (with C fallback). This commit separates the current proofs for these three functions following the above structure. For each function, the following steps are performed: - Add the corresponding CBMC contract, copied from the wrapper function. - Create a dedicated CBMC proof for the pure C implementation. - Update the existing wrapper CBMC proof Makefiles by adding XXX_c to USE_FUNCTION_CONTRACTS, and apply the same change to the native proof configuration. 
Signed-off-by: willieyz Signed-off-by: Danny Tsen --- mlkem/src/fips202/keccakf1600.c | 31 +++++++++ proofs/cbmc/keccakf1600_permute/Makefile | 2 +- proofs/cbmc/keccakf1600_permute_c/Makefile | 66 +++++++++++++++++++ .../keccakf1600_permute_c_harness.c | 13 ++++ .../cbmc/keccakf1600_permute_native/Makefile | 2 +- .../cbmc/keccakf1600x4_extract_bytes/Makefile | 2 +- .../keccakf1600x4_extract_bytes_c/Makefile | 55 ++++++++++++++++ .../keccakf1600x4_extract_bytes_c_harness.c | 21 ++++++ .../Makefile | 2 +- proofs/cbmc/keccakf1600x4_xor_bytes/Makefile | 2 +- .../cbmc/keccakf1600x4_xor_bytes_c/Makefile | 55 ++++++++++++++++ .../keccakf1600x4_xor_bytes_c_harness.c | 22 +++++++ .../keccakf1600x4_xor_bytes_native/Makefile | 2 +- 13 files changed, 269 insertions(+), 6 deletions(-) create mode 100644 proofs/cbmc/keccakf1600_permute_c/Makefile create mode 100644 proofs/cbmc/keccakf1600_permute_c/keccakf1600_permute_c_harness.c create mode 100644 proofs/cbmc/keccakf1600x4_extract_bytes_c/Makefile create mode 100644 proofs/cbmc/keccakf1600x4_extract_bytes_c/keccakf1600x4_extract_bytes_c_harness.c create mode 100644 proofs/cbmc/keccakf1600x4_xor_bytes_c/Makefile create mode 100644 proofs/cbmc/keccakf1600x4_xor_bytes_c/keccakf1600x4_xor_bytes_c_harness.c diff --git a/mlkem/src/fips202/keccakf1600.c b/mlkem/src/fips202/keccakf1600.c index 23679f411d..928a3e0a19 100644 --- a/mlkem/src/fips202/keccakf1600.c +++ b/mlkem/src/fips202/keccakf1600.c @@ -86,6 +86,19 @@ static void mlk_keccakf1600x4_extract_bytes_c(uint64_t *state, unsigned char *data2, unsigned char *data3, unsigned offset, unsigned length) +__contract__( + requires(0 <= offset && offset <= MLK_KECCAK_LANES * sizeof(uint64_t) && + 0 <= length && length <= MLK_KECCAK_LANES * sizeof(uint64_t) - offset) + requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) + requires(memory_no_alias(data0, length)) + requires(memory_no_alias(data1, length)) + requires(memory_no_alias(data2, length)) + 
requires(memory_no_alias(data3, length)) + assigns(memory_slice(data0, length)) + assigns(memory_slice(data1, length)) + assigns(memory_slice(data2, length)) + assigns(memory_slice(data3, length)) +) { mlk_keccakf1600_extract_bytes(state + MLK_KECCAK_LANES * 0, data0, offset, length); @@ -120,6 +133,20 @@ static void mlk_keccakf1600x4_xor_bytes_c(uint64_t *state, const unsigned char *data2, const unsigned char *data3, unsigned offset, unsigned length) +__contract__( + requires(0 <= offset && offset <= MLK_KECCAK_LANES * sizeof(uint64_t) && + 0 <= length && length <= MLK_KECCAK_LANES * sizeof(uint64_t) - offset) + requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) + requires(memory_no_alias(data0, length)) + /* Case 1: all input buffers are distinct; Case 2: All input buffers are the same */ + requires((data0 == data1 && + data0 == data2 && + data0 == data3) || + (memory_no_alias(data1, length) && + memory_no_alias(data2, length) && + memory_no_alias(data3, length))) + assigns(memory_slice(state, sizeof(uint64_t) * MLK_KECCAK_LANES * MLK_KECCAK_WAY)) +) { mlk_keccakf1600_xor_bytes(state + MLK_KECCAK_LANES * 0, data0, offset, length); @@ -179,6 +206,10 @@ static const uint64_t mlk_KeccakF_RoundConstants[MLK_KECCAK_NROUNDS] = { MLK_STATIC_TESTABLE void mlk_keccakf1600_permute_c(uint64_t *state) +__contract__( + requires(memory_no_alias(state, sizeof(uint64_t) * MLK_KECCAK_LANES)) + assigns(memory_slice(state, sizeof(uint64_t) * MLK_KECCAK_LANES)) +) { unsigned round; diff --git a/proofs/cbmc/keccakf1600_permute/Makefile b/proofs/cbmc/keccakf1600_permute/Makefile index 2dfa7af0d4..e955f322c5 100644 --- a/proofs/cbmc/keccakf1600_permute/Makefile +++ b/proofs/cbmc/keccakf1600_permute/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600_permute -USE_FUNCTION_CONTRACTS= 
+USE_FUNCTION_CONTRACTS=mlk_keccakf1600_permute_c APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 diff --git a/proofs/cbmc/keccakf1600_permute_c/Makefile b/proofs/cbmc/keccakf1600_permute_c/Makefile new file mode 100644 index 0000000000..a1b52b6bdb --- /dev/null +++ b/proofs/cbmc/keccakf1600_permute_c/Makefile @@ -0,0 +1,66 @@ +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = keccakf1600_permute_c_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = mlk_keccakf1600_permute_c + +DEFINES += +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600_permute_c +USE_FUNCTION_CONTRACTS= +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--smt2 + +# For this proof we tell CBMC to +# - not decompose arrays into their individual cells +# - to slice constraints that are not in the cone of influence of the proof obligations +# These options simplify the modelling of arrays and produce much more compact +# SMT files, leaving all array-type reasoning to the SMT solver. +# +# For functions that use large and multi-dimensional arrays, this yields +# a substantial improvement in proof performance. +CBMCFLAGS += --no-array-field-sensitivity +CBMCFLAGS += --slice-formula + +FUNCTION_NAME = mlk_keccakf1600_permute_c + +# If this proof is found to consume huge amounts of RAM, you can set the +# EXPENSIVE variable. With new enough versions of the proof tools, this will +# restrict the number of EXPENSIVE CBMC jobs running at once. 
See the +# documentation in Makefile.common under the "Job Pools" heading for details. +# EXPENSIVE = true + +# This function is large enough to need... +CBMC_OBJECT_BITS = 8 + +# If you require access to a file-local ("static") function or object to conduct +# your proof, set the following (and do not include the original source file +# ("mlkem/src/poly.c") in PROJECT_SOURCES). +# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i +# include ../Makefile.common +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mlkem/src/poly.c +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz +# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must +# be set before including Makefile.common, but any use of variables on the +# left-hand side requires those variables to be defined. Hence, _SOURCE, +# _FUNCTIONS, _OBJECTS is set after including Makefile.common. + +include ../Makefile.common diff --git a/proofs/cbmc/keccakf1600_permute_c/keccakf1600_permute_c_harness.c b/proofs/cbmc/keccakf1600_permute_c/keccakf1600_permute_c_harness.c new file mode 100644 index 0000000000..31ba57a3d9 --- /dev/null +++ b/proofs/cbmc/keccakf1600_permute_c/keccakf1600_permute_c_harness.c @@ -0,0 +1,13 @@ +// Copyright (c) The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT-0 + +#include + +void mlk_keccakf1600_permute_c(uint64_t *state); + +void harness(void) +{ + uint64_t *s; + mlk_keccakf1600_permute_c(s); +} diff --git a/proofs/cbmc/keccakf1600_permute_native/Makefile b/proofs/cbmc/keccakf1600_permute_native/Makefile index 6fb2e2062d..0b77202e56 100644 --- a/proofs/cbmc/keccakf1600_permute_native/Makefile +++ b/proofs/cbmc/keccakf1600_permute_native/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600_permute -USE_FUNCTION_CONTRACTS=mlk_keccak_f1600_x1_native +USE_FUNCTION_CONTRACTS=mlk_keccak_f1600_x1_native mlk_keccakf1600_permute_c APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 diff --git a/proofs/cbmc/keccakf1600x4_extract_bytes/Makefile b/proofs/cbmc/keccakf1600x4_extract_bytes/Makefile index cf06c89374..03e6199fd1 100644 --- a/proofs/cbmc/keccakf1600x4_extract_bytes/Makefile +++ b/proofs/cbmc/keccakf1600x4_extract_bytes/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600x4_extract_bytes -USE_FUNCTION_CONTRACTS=mlk_keccakf1600_extract_bytes +USE_FUNCTION_CONTRACTS=mlk_keccakf1600x4_extract_bytes_c APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 diff --git a/proofs/cbmc/keccakf1600x4_extract_bytes_c/Makefile b/proofs/cbmc/keccakf1600x4_extract_bytes_c/Makefile new file mode 100644 index 0000000000..936fa431fd --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_extract_bytes_c/Makefile @@ -0,0 +1,55 @@ +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = keccakf1600x4_extract_bytes_c_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. 
It can be human-readable and contain spaces if you wish. +PROOF_UID = mlk_keccakf1600x4_extract_bytes_c + +DEFINES += +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600x4_extract_bytes_c +USE_FUNCTION_CONTRACTS=mlk_keccakf1600_extract_bytes +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--smt2 + +FUNCTION_NAME = mlk_keccakf1600x4_extract_bytes_c + +# If this proof is found to consume huge amounts of RAM, you can set the +# EXPENSIVE variable. With new enough versions of the proof tools, this will +# restrict the number of EXPENSIVE CBMC jobs running at once. See the +# documentation in Makefile.common under the "Job Pools" heading for details. +# EXPENSIVE = true + +# This function is large enough to need... +CBMC_OBJECT_BITS = 8 + +# If you require access to a file-local ("static") function or object to conduct +# your proof, set the following (and do not include the original source file +# ("mlkem/src/poly.c") in PROJECT_SOURCES). +# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i +# include ../Makefile.common +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mlkem/src/poly.c +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz +# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must +# be set before including Makefile.common, but any use of variables on the +# left-hand side requires those variables to be defined. Hence, _SOURCE, +# _FUNCTIONS, _OBJECTS is set after including Makefile.common. 
+ +include ../Makefile.common diff --git a/proofs/cbmc/keccakf1600x4_extract_bytes_c/keccakf1600x4_extract_bytes_c_harness.c b/proofs/cbmc/keccakf1600x4_extract_bytes_c/keccakf1600x4_extract_bytes_c_harness.c new file mode 100644 index 0000000000..66801051a7 --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_extract_bytes_c/keccakf1600x4_extract_bytes_c_harness.c @@ -0,0 +1,21 @@ +// Copyright (c) The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT-0 + +#include + +void mlk_keccakf1600x4_extract_bytes_c(uint64_t *state, unsigned char *data0, + unsigned char *data1, + unsigned char *data2, + unsigned char *data3, unsigned offset, + unsigned length); + +void harness(void) +{ + uint64_t *state; + unsigned char *data0, *data1, *data2, *data3; + unsigned offset; + unsigned length; + mlk_keccakf1600x4_extract_bytes_c(state, data0, data1, data2, data3, offset, + length); +} diff --git a/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile b/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile index 72c0b5117c..0526a67a87 100644 --- a/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile +++ b/proofs/cbmc/keccakf1600x4_extract_bytes_native/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600x4_extract_bytes -USE_FUNCTION_CONTRACTS=mlk_keccakf1600_extract_bytes_x4_native +USE_FUNCTION_CONTRACTS=mlk_keccakf1600_extract_bytes_x4_native mlk_keccakf1600x4_extract_bytes_c APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 diff --git a/proofs/cbmc/keccakf1600x4_xor_bytes/Makefile b/proofs/cbmc/keccakf1600x4_xor_bytes/Makefile index fab0c891e4..5c92fc58d9 100644 --- a/proofs/cbmc/keccakf1600x4_xor_bytes/Makefile +++ b/proofs/cbmc/keccakf1600x4_xor_bytes/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += 
$(SRCDIR)/mlkem/src/fips202/keccakf1600.c CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600x4_xor_bytes -USE_FUNCTION_CONTRACTS=mlk_keccakf1600_xor_bytes +USE_FUNCTION_CONTRACTS=mlk_keccakf1600x4_xor_bytes_c APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 diff --git a/proofs/cbmc/keccakf1600x4_xor_bytes_c/Makefile b/proofs/cbmc/keccakf1600x4_xor_bytes_c/Makefile new file mode 100644 index 0000000000..371742bf06 --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_xor_bytes_c/Makefile @@ -0,0 +1,55 @@ +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = keccakf1600x4_xor_bytes_c_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = mlk_keccakf1600x4_xor_bytes_c + +DEFINES += +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c + +CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600x4_xor_bytes_c +USE_FUNCTION_CONTRACTS=mlk_keccakf1600_xor_bytes +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--smt2 + +FUNCTION_NAME = mlk_keccakf1600x4_xor_bytes_c + +# If this proof is found to consume huge amounts of RAM, you can set the +# EXPENSIVE variable. With new enough versions of the proof tools, this will +# restrict the number of EXPENSIVE CBMC jobs running at once. See the +# documentation in Makefile.common under the "Job Pools" heading for details. +# EXPENSIVE = true + +# This function is large enough to need... 
+CBMC_OBJECT_BITS = 8 + +# If you require access to a file-local ("static") function or object to conduct +# your proof, set the following (and do not include the original source file +# ("mlkem/src/poly.c") in PROJECT_SOURCES). +# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i +# include ../Makefile.common +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mlkem/src/poly.c +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar +# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz +# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must +# be set before including Makefile.common, but any use of variables on the +# left-hand side requires those variables to be defined. Hence, _SOURCE, +# _FUNCTIONS, _OBJECTS is set after including Makefile.common. + +include ../Makefile.common diff --git a/proofs/cbmc/keccakf1600x4_xor_bytes_c/keccakf1600x4_xor_bytes_c_harness.c b/proofs/cbmc/keccakf1600x4_xor_bytes_c/keccakf1600x4_xor_bytes_c_harness.c new file mode 100644 index 0000000000..905e8a90de --- /dev/null +++ b/proofs/cbmc/keccakf1600x4_xor_bytes_c/keccakf1600x4_xor_bytes_c_harness.c @@ -0,0 +1,22 @@ +// Copyright (c) The mlkem-native project authors +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT-0 + +#include + + +void mlk_keccakf1600x4_xor_bytes_c(uint64_t *state, const unsigned char *data0, + const unsigned char *data1, + const unsigned char *data2, + const unsigned char *data3, unsigned offset, + unsigned length); + +void harness(void) +{ + uint64_t *state; + const unsigned char *data0, *data1, *data2, *data3; + unsigned offset; + unsigned length; + mlk_keccakf1600x4_xor_bytes_c(state, data0, data1, data2, data3, offset, + length); +} diff --git a/proofs/cbmc/keccakf1600x4_xor_bytes_native/Makefile b/proofs/cbmc/keccakf1600x4_xor_bytes_native/Makefile index c2d0743fa3..beeb9b85d4 100644 --- a/proofs/cbmc/keccakf1600x4_xor_bytes_native/Makefile +++ b/proofs/cbmc/keccakf1600x4_xor_bytes_native/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += $(SRCDIR)/mlkem/src/fips202/keccakf1600.c CHECK_FUNCTION_CONTRACTS=mlk_keccakf1600x4_xor_bytes -USE_FUNCTION_CONTRACTS=mlk_keccakf1600_xor_bytes_x4_native +USE_FUNCTION_CONTRACTS=mlk_keccakf1600_xor_bytes_x4_native mlk_keccakf1600x4_xor_bytes_c APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 From 8224a81041063d0a270844e56eb1ef4064ea701e Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Fri, 3 Apr 2026 22:11:38 +0800 Subject: [PATCH 02/27] CI: Update AWS-LC integration to current HEAD AWS-LC tests against their main branch have been failing since mlkem-native v1.1.0 was merged there. Remove patches as upstream has absorbed all changes from the patches. Update pinned AWS-LC commit to current main. Signed-off-by: Matthias J. 
Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/all.yml | 2 +- integration/aws-lc/post_import.patch | 28 ---------------------------- integration/aws-lc/pre_import.patch | 20 -------------------- 3 files changed, 1 insertion(+), 49 deletions(-) delete mode 100644 integration/aws-lc/post_import.patch delete mode 100644 integration/aws-lc/pre_import.patch diff --git a/.github/workflows/all.yml b/.github/workflows/all.yml index ee37224f19..52a3721319 100644 --- a/.github/workflows/all.yml +++ b/.github/workflows/all.yml @@ -75,7 +75,7 @@ jobs: needs: [ base ] uses: ./.github/workflows/integration-awslc.yml with: - commit: f3c6fff9a1a04183b5fbacdf55a2abf9a29c142f # main (2026-03-14) + commit: a75e930cecced7221631220475f2589335d4d67f # main (2026-04-03) secrets: inherit ct-test: name: Constant-time diff --git a/integration/aws-lc/post_import.patch b/integration/aws-lc/post_import.patch deleted file mode 100644 index c56d8d8931..0000000000 --- a/integration/aws-lc/post_import.patch +++ /dev/null @@ -1,28 +0,0 @@ -From dc1fb278d8d15bf931a805faf1298403c9c49c99 Mon Sep 17 00:00:00 2001 -From: Andreas Hatziiliou -Date: Mon, 12 Jan 2026 09:51:43 -0500 -Subject: [PATCH] post_import - ---- - crypto/fipsmodule/ml_kem/mlkem_native_config.h | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/crypto/fipsmodule/ml_kem/mlkem_native_config.h b/crypto/fipsmodule/ml_kem/mlkem_native_config.h -index 8b1ce254b..7b5a2228a 100644 ---- a/crypto/fipsmodule/ml_kem/mlkem_native_config.h -+++ b/crypto/fipsmodule/ml_kem/mlkem_native_config.h -@@ -78,8 +78,9 @@ static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) { - #include - #include "mlkem/sys.h" - #include --static MLK_INLINE void mlk_randombytes(void *ptr, size_t len) { -- AWSLC_ABORT_IF_NOT_ONE(RAND_bytes(ptr, len)); -+static MLK_INLINE int mlk_randombytes(void *ptr, size_t len) { -+ AWSLC_ABORT_IF_NOT_ONE(RAND_bytes(ptr, len)); -+ return 0; - } - #endif // !__ASSEMBLER__ - --- -2.52.0 - diff --git 
a/integration/aws-lc/pre_import.patch b/integration/aws-lc/pre_import.patch deleted file mode 100644 index 077cb3bd02..0000000000 --- a/integration/aws-lc/pre_import.patch +++ /dev/null @@ -1,20 +0,0 @@ -diff --git a/crypto/fipsmodule/ml_kem/importer.sh b/crypto/fipsmodule/ml_kem/importer.sh -index 24832ab..b81348a 100644 ---- a/crypto/fipsmodule/ml_kem/importer.sh -+++ b/crypto/fipsmodule/ml_kem/importer.sh -@@ -151,12 +151,14 @@ for file in $SRC/native/aarch64/src/*.S $SRC/native/x86_64/src/*.S; do - mv "$tmp_file" "$file" - - # Replace common.h include and assembly macros -- sed "${SED_I[@]}" 's/#include "\.\.\/\.\.\/\.\.\/common\.h"/#include "_internal_s2n_bignum.h"/' "$file" -+ s2n_header=$(if [[ "$file" == *"aarch64"* ]]; then echo "_internal_s2n_bignum_arm.h"; else echo "_internal_s2n_bignum_x86_att.h"; fi) -+ sed "${SED_I[@]}" "s/#include \"\.\.\/\.\.\/\.\.\/common\.h\"/#include \"$s2n_header\"/" "$file" - - func_name=$(grep -o '\.global MLK_ASM_NAMESPACE(\([^)]*\))' "$file" | sed 's/\.global MLK_ASM_NAMESPACE(\([^)]*\))/\1/') - if [ -n "$func_name" ]; then - sed "${SED_I[@]}" "s/\.global MLK_ASM_NAMESPACE($func_name)/ S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_$func_name)\n S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_$func_name)/" "$file" - sed "${SED_I[@]}" "s/MLK_ASM_FN_SYMBOL($func_name)/S2N_BN_SYMBOL(mlkem_$func_name):/" "$file" -+ sed "${SED_I[@]}" "s/MLK_ASM_FN_SIZE($func_name)/S2N_BN_SIZE_DIRECTIVE(mlkem_$func_name)/" "$file" - fi - done - From ba5f7b7f0ed65daa3b5a06e4464e337766328d25 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Sat, 4 Apr 2026 18:26:50 +0800 Subject: [PATCH 03/27] MAINTAINERS.md: Remove affiliations and chat I recently switched from Chelpis to zeroRISC meaning our MAINTAINERS.md is outdated. This commit removes affiliations and Discord identifiers from the file as they are unnecessary. Signed-off-by: Matthias J. 
Kannwischer Signed-off-by: Danny Tsen --- MAINTAINERS.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 0a93e0de1b..a246b98c3f 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -4,7 +4,7 @@ ## Active Maintainers -| Name | GitHub | Chat | Affiliation -|-------------------------|-------------------------------------------|----------------|---------------------- -| Hanno Becker | [hanno-becker](https://github.com/hanno-becker) | | AWS | -| Matthias J. Kannwischer | [mkannwischer](https://github.com/mkannwischer) | matthiaskannwischer | Chelpis Quantum Corp | +| Name | GitHub | +|-------------------------|-------------------------------------------------| +| Hanno Becker | [hanno-becker](https://github.com/hanno-becker) | +| Matthias J. Kannwischer | [mkannwischer](https://github.com/mkannwischer) | From 0ca9e895f1f767f0cf0df80057a447a8ecc89dcb Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Sat, 4 Apr 2026 19:34:39 +0800 Subject: [PATCH 04/27] CI: Add clang22 tests and constant-time tests Signed-off-by: Matthias J. 
Kannwischer Signed-off-by: Danny Tsen --- .github/workflows/ci.yml | 7 +++++++ .github/workflows/ct-tests.yml | 3 ++- flake.lock | 6 +++--- flake.nix | 4 ++++ 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6622147acb..28732278f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -305,6 +305,13 @@ jobs: c23: True opt: all examples: true + - name: clang-22 + shell: clang22 + darwin: True + c17: True + c23: True + opt: all + examples: true # CPU flags are not correctly passed to the zig assembler # https://github.com/ziglang/zig/issues/23576 # We therefore only test the C backend diff --git a/.github/workflows/ct-tests.yml b/.github/workflows/ct-tests.yml index 559853b484..d4faac03c9 100644 --- a/.github/workflows/ct-tests.yml +++ b/.github/workflows/ct-tests.yml @@ -28,6 +28,7 @@ jobs: - valgrind-varlat_clang19 - valgrind-varlat_clang20 - valgrind-varlat_clang21 + - valgrind-varlat_clang22 - valgrind-varlat_gcc48 - valgrind-varlat_gcc49 - valgrind-varlat_gcc7 @@ -64,7 +65,7 @@ jobs: valgrind_flags: --variable-latency-errors=yes - name: Build and run test (-Ofast) # -Ofast got deprecated in clang19; -O3 -ffast-math should be used instead - if: ${{ matrix.nix-shell != 'valgrind-varlat_clang19' && matrix.nix-shell != 'valgrind-varlat_clang20' && matrix.nix-shell != 'valgrind-varlat_clang21'}} + if: ${{ matrix.nix-shell != 'valgrind-varlat_clang19' && matrix.nix-shell != 'valgrind-varlat_clang20' && matrix.nix-shell != 'valgrind-varlat_clang21' && matrix.nix-shell != 'valgrind-varlat_clang22'}} uses: ./.github/actions/ct-test with: cflags: -Ofast -DMLK_CONFIG_KEYGEN_PCT diff --git a/flake.lock b/flake.lock index 0b017ab67c..dd7666e1dd 100644 --- a/flake.lock +++ b/flake.lock @@ -54,11 +54,11 @@ }, "nixpkgs-unstable": { "locked": { - "lastModified": 1770197578, - "narHash": "sha256-AYqlWrX09+HvGs8zM6ebZ1pwUqjkfpnv8mewYwAo+iM=", + "lastModified": 1775036866, + "narHash": 
"sha256-ZojAnPuCdy657PbTq5V0Y+AHKhZAIwSIT2cb8UgAz/U=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "00c21e4c93d963c50d4c0c89bfa84ed6e0694df2", + "rev": "6201e203d09599479a3b3450ed24fa81537ebc4e", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 02a55e76b3..f022fbc15f 100644 --- a/flake.nix +++ b/flake.nix @@ -52,6 +52,8 @@ inherit system; overlays = [ (_:_: { + clang_22 = pkgs-unstable.clang_22; + # From 24.05 (dropped in 25.11) gcc48 = pkgs-2405.gcc48; gcc49 = pkgs-2405.gcc49; @@ -179,6 +181,7 @@ devShells.clang19 = util.mkShellWithCC' pkgs.clang_19; devShells.clang20 = util.mkShellWithCC' pkgs.clang_20; devShells.clang21 = util.mkShellWithCC' pkgs.clang_21; + devShells.clang22 = util.mkShellWithCC' pkgs.clang_22; devShells.zig0_12 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_12); devShells.zig0_13 = util.mkShellWithCC' (zigWrapCC pkgs.zig_0_13); @@ -203,6 +206,7 @@ devShells.valgrind-varlat_clang19 = util.mkShellWithCC_valgrind' pkgs.clang_19; devShells.valgrind-varlat_clang20 = util.mkShellWithCC_valgrind' pkgs.clang_20; devShells.valgrind-varlat_clang21 = util.mkShellWithCC_valgrind' pkgs.clang_21; + devShells.valgrind-varlat_clang22 = util.mkShellWithCC_valgrind' pkgs.clang_22; devShells.valgrind-varlat_gcc48 = util.mkShellWithCC_valgrind' pkgs.gcc48; devShells.valgrind-varlat_gcc49 = util.mkShellWithCC_valgrind' pkgs.gcc49; devShells.valgrind-varlat_gcc7 = util.mkShellWithCC_valgrind' pkgs.gcc7; From 3da8401bb0492aba861918f164e8c08dcb98a61b Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 6 Apr 2026 13:28:11 -0400 Subject: [PATCH 05/27] Updated patch for ML_KEM for ppc64le supports p8 and above architectures and tested. 1. Run scripts/autogen and scripts/lint on Mac but not sure if it runs for ppc64le. 2. Run simpasm on Red Hat Linux. 3. Added detailed comments on NTT and INTT implementations. 4. Used C type symbols to improve readability. 5. Fixed some typos. Signed-off-by: Danny Tsen The following tests were run on p10. 
[09:28] danny@ltcden12-lp1 new_ppc64le_mlkem % ./scripts/tests func INFO > Functional Test Compile (native no_opt): make func OPT=0 AUTO=1 -j40 INFO > Functional Test ML-KEM-512 (native no_opt): make run_func_512 -j40 INFO > Functional Test ML-KEM-768 (native no_opt): make run_func_768 -j40 INFO > Functional Test ML-KEM-1024 (native no_opt): make run_func_1024 -j40 INFO > Functional Test Compile (native opt): make func OPT=1 AUTO=1 -j40 INFO > Functional Test ML-KEM-512 (native opt): make run_func_512 -j40 INFO > Functional Test ML-KEM-768 (native opt): make run_func_768 -j40 INFO > Functional Test ML-KEM-1024 (native opt): make run_func_1024 -j40 All good! [09:28] danny@ltcden12-lp1 new_ppc64le_mlkem % ./scripts/tests bench -c PERF INFO > Benchmark Compile (native no_opt): make bench OPT=0 AUTO=1 CYCLES=PERF -j40 INFO > Benchmark ML-KEM-512 (native no_opt): make run_bench_512 INFO > Benchmark ML-KEM-512 (native no_opt): test/build/mlkem512/bin/bench_mlkem512 keypair cycles = 66982 encaps cycles = 78820 decaps cycles = 100923 percentile 1 10 20 30 40 50 60 70 80 90 99 keypair percentiles: 66438 66690 66791 66857 66920 66982 67043 67122 67218 67306 71905 encaps percentiles: 78322 78516 78618 78687 78752 78820 78878 78933 79012 79116 83825 decaps percentiles: 100427 100634 100733 100804 100869 100923 100985 101056 101131 101253 105852 INFO > Benchmark ML-KEM-768 (native no_opt): make run_bench_768 INFO > Benchmark ML-KEM-768 (native no_opt): test/build/mlkem768/bin/bench_mlkem768 keypair cycles = 111380 encaps cycles = 125891 decaps cycles = 154364 percentile 1 10 20 30 40 50 60 70 80 90 99 keypair percentiles: 110575 110914 111083 111192 111291 111380 111496 111617 111821 112414 116725 encaps percentiles: 125081 125403 125526 125655 125776 125891 125998 126122 126293 126771 131358 decaps percentiles: 153575 153870 154008 154131 154261 154364 154487 154630 154782 155313 159863 INFO > Benchmark ML-KEM-1024 (native no_opt): make run_bench_1024 INFO > Benchmark 
ML-KEM-1024 (native no_opt): test/build/mlkem1024/bin/bench_mlkem1024 keypair cycles = 166809 encaps cycles = 185315 decaps cycles = 220229 percentile 1 10 20 30 40 50 60 70 80 90 99 keypair percentiles: 165339 165995 166236 166435 166616 166809 167007 167200 167505 171058 175606 encaps percentiles: 183839 184563 184778 184951 185158 185315 185518 185744 186123 189637 192014 decaps percentiles: 218911 219430 219705 219841 220027 220229 220436 220673 221029 224484 226901 INFO > Benchmark Compile (native opt): make bench OPT=1 AUTO=1 CYCLES=PERF -j40 INFO > Benchmark ML-KEM-512 (native opt): make run_bench_512 INFO > Benchmark ML-KEM-512 (native opt): test/build/mlkem512/bin/bench_mlkem512 keypair cycles = 45750 encaps cycles = 50661 decaps cycles = 63561 percentile 1 10 20 30 40 50 60 70 80 90 99 keypair percentiles: 45248 45469 45546 45620 45690 45750 45806 45886 45954 46063 50703 encaps percentiles: 50192 50367 50468 50542 50600 50661 50710 50771 50858 50954 55652 decaps percentiles: 63091 63276 63381 63436 63497 63561 63623 63679 63743 63857 68437 INFO > Benchmark ML-KEM-768 (native opt): make run_bench_768 INFO > Benchmark ML-KEM-768 (native opt): test/build/mlkem768/bin/bench_mlkem768 keypair cycles = 79045 encaps cycles = 86455 decaps cycles = 103878 percentile 1 10 20 30 40 50 60 70 80 90 99 keypair percentiles: 78313 78578 78742 78847 78954 79045 79169 79285 79470 79978 84430 encaps percentiles: 85628 86009 86172 86272 86363 86455 86592 86711 86879 87292 92038 decaps percentiles: 103041 103399 103540 103676 103788 103878 103993 104104 104274 104736 109361 INFO > Benchmark ML-KEM-1024 (native opt): make run_bench_1024 INFO > Benchmark ML-KEM-1024 (native opt): test/build/mlkem1024/bin/bench_mlkem1024 keypair cycles = 124072 encaps cycles = 134500 decaps cycles = 157090 percentile 1 10 20 30 40 50 60 70 80 90 99 keypair percentiles: 122727 123259 123515 123720 123929 124072 124253 124527 125009 128466 133334 encaps percentiles: 133064 133681 133933 134129 
134320 134500 134711 134933 135346 138753 141067 decaps percentiles: 155503 156261 156510 156694 156894 157090 157285 157605 158014 161592 166723 All good! Signed-off-by: Danny Tsen --- dev/ppc64le/README.md | 6 + dev/ppc64le/meta.h | 53 + dev/ppc64le/src/arith_native_ppc64le.h | 24 + dev/ppc64le/src/consts.c | 82 + dev/ppc64le/src/consts.h | 27 + dev/ppc64le/src/consts_intt.inc | 59 + dev/ppc64le/src/consts_ntt.inc | 59 + dev/ppc64le/src/intt_ppc.S | 828 ++++ dev/ppc64le/src/ntt_ppc.S | 653 ++++ dev/ppc64le/src/poly_tomont.S | 189 + dev/ppc64le/src/reduce.S | 236 ++ integration/liboqs/ML-KEM-1024_META.yml | 19 + integration/liboqs/ML-KEM-512_META.yml | 19 + integration/liboqs/ML-KEM-768_META.yml | 19 + integration/liboqs/config_ppc64le.h | 266 ++ mlkem/src/native/meta.h | 4 + mlkem/src/native/ppc64le/README.md | 6 + mlkem/src/native/ppc64le/meta.h | 53 + .../native/ppc64le/src/arith_native_ppc64le.h | 24 + mlkem/src/native/ppc64le/src/consts.c | 82 + mlkem/src/native/ppc64le/src/consts.h | 27 + mlkem/src/native/ppc64le/src/consts_intt.inc | 59 + mlkem/src/native/ppc64le/src/consts_ntt.inc | 59 + mlkem/src/native/ppc64le/src/intt_ppc.S | 3418 +++++++++++++++++ mlkem/src/native/ppc64le/src/ntt_ppc.S | 1791 +++++++++ mlkem/src/native/ppc64le/src/poly_tomont.S | 361 ++ mlkem/src/native/ppc64le/src/reduce.S | 713 ++++ test/mk/components.mk | 1 + 28 files changed, 9137 insertions(+) create mode 100644 dev/ppc64le/README.md create mode 100644 dev/ppc64le/meta.h create mode 100644 dev/ppc64le/src/arith_native_ppc64le.h create mode 100644 dev/ppc64le/src/consts.c create mode 100644 dev/ppc64le/src/consts.h create mode 100644 dev/ppc64le/src/consts_intt.inc create mode 100644 dev/ppc64le/src/consts_ntt.inc create mode 100644 dev/ppc64le/src/intt_ppc.S create mode 100644 dev/ppc64le/src/ntt_ppc.S create mode 100644 dev/ppc64le/src/poly_tomont.S create mode 100644 dev/ppc64le/src/reduce.S create mode 100644 integration/liboqs/config_ppc64le.h create mode 100644 
mlkem/src/native/ppc64le/README.md create mode 100644 mlkem/src/native/ppc64le/meta.h create mode 100644 mlkem/src/native/ppc64le/src/arith_native_ppc64le.h create mode 100644 mlkem/src/native/ppc64le/src/consts.c create mode 100644 mlkem/src/native/ppc64le/src/consts.h create mode 100644 mlkem/src/native/ppc64le/src/consts_intt.inc create mode 100644 mlkem/src/native/ppc64le/src/consts_ntt.inc create mode 100644 mlkem/src/native/ppc64le/src/intt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/ntt_ppc.S create mode 100644 mlkem/src/native/ppc64le/src/poly_tomont.S create mode 100644 mlkem/src/native/ppc64le/src/reduce.S diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 0000000000..34f8cbec66 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include /* NOTE(review): header name is missing here - confirm and restore */ +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" +/* Wrappers: run the assembly routine on data with the shared constant table. */ +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..aebb4711ab --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include /* NOTE(review): header name is missing here - confirm and restore */ +#include "../../../common.h" +#include "consts.h" +/* Assembly routines; meta.h calls each one as f(data, mlk_ppc_qdata). */ +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) +void mlk_poly_tomont_ppc(int16_t *, const
int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 0000000000..35c3e4b335 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include /* NOTE(review): header names are missing on these four lines - confirm and restore */ +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { + /* -Q (byte offset NQ_OFFSET) */ + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + /* QINV (byte offset QINV_OFFSET) */ + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + /* Q (byte offset Q_OFFSET) */ + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT (byte offset ZETA_NTT_OFFSET) */ +#include "consts_ntt.inc" + , +/* zetas for invNTT (byte offset ZETA_INTT_OFFSET) */ +#include "consts_intt.inc" +}; +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 0000000000..c861ddec6c --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +/* Byte offsets into the constant table mlk_ppc_qdata (16 bytes per constant) */ +/* check-magic: off */ +#define NQ_OFFSET 0 +#define QINV_OFFSET 16
+#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 +/* check-magic: on */ + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc new file mode 100644 index 0000000000..d0203dd178 --- /dev/null +++ b/dev/ppc64le/src/consts_intt.inc @@ -0,0 +1,59 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* + * For intt Len=2, starting at byte offset ZETA_INTT_OFFSET + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[1], z[2], z[3], z[4] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4: 4-4 layout, each zeta repeated 4 times */ + 677, 677, 677, 677, -1275, -1275, -1275, -1275, + 448, 448, 448, 448, -1065, -1065, -1065, -1065, + -1508, -1508, -1508, -1508, -725, -725, -725, -725, + -398, -398, -398, -398, 961, 961, 961, 961, + -247, -247, -247, -247, -951, -951, -951, -951, + 107, 107, 107, 107, -1421, -1421, -1421, -1421, + -271, -271, -271, -271, 830, 830, 830, 830,
+ -853, -853, -853, -853, -90, -90, -90, -90, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, + -1618, -1618, -1618, -1618, -1162, -1162, -1162, -1162, + -320, -320, -320, -320, -666, -666, -666, -666, + 516, 516, 516, 516, -8, -8, -8, -8, + -282, -282, -282, -282, -1544, -1544, -1544, -1544, + -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, + -552, -552, -552, -552, 1015, 1015, 1015, 1015, + 1223, 1223, 1223, 1223, 652, 652, 652, 652, + /* For intt Len=8, 16, 32, 64 and 128: one zeta per 8-lane vector */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758 diff --git
a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc new file mode 100644 index 0000000000..2a0136f1e5 --- /dev/null +++ b/dev/ppc64le/src/consts_ntt.inc @@ -0,0 +1,59 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For ntt Len=128 onwards, one zeta per 8-lane vector; byte offset 96 (ZETA_NTT_OFFSET) */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, -1571, -1571, -1571, -1571, -1571, + /* For ntt Len=4: each zeta repeated 4 times */ + 652, 652, 652, 652, 1223, 1223, 1223, 1223, + 1015, 1015, 1015, 1015, -552, -552, -552, -552, + 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293,
+ -1544, -1544, -1544, -1544, -282, -282, -282, -282, + -8, -8, -8, -8, 516, 516, 516, 516, + -666, -666, -666, -666, -320, -320, -320, -320, + -1162, -1162, -1162, -1162, -1618, -1618, -1618, -1618, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, + -90, -90, -90, -90, -853, -853, -853, -853, + 830, 830, 830, 830, -271, -271, -271, -271, + -1421, -1421, -1421, -1421, 107, 107, 107, 107, + -951, -951, -951, -951, -247, -247, -247, -247, + 961, 961, 961, 961, -398, -398, -398, -398, + -725, -725, -725, -725, -1508, -1508, -1508, -1508, + -1065, -1065, -1065, -1065, 448, 448, 448, 448, + -1275, -1275, -1275, -1275, 677, 677, 677, 677, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[1], z[2], z[3], z[4] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460 diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..3355118384 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc.S @@ -0,0 +1,828 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright IBM Corp.
2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +.machine "any" +.text + +/* Barrett reduce constants */ +#define V20159 0 +#define V2pw25 1 +#define V_26 2 +#define V_MKQ 3 + +/* Montgomery reduce constants (v2 and v3 are shared with Barrett and reloaded per phase) */ +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V1441 10 +/* NOTE: V_Z3, V_ZETA and V1441 all name v10; they are used in different phases */ +#define vdata_a1 21 +#define vdata_a2 22 +#define vdata_a3 23 +#define vdata_a4 24 +#define vdata_b1 8 +#define vdata_b2 12 +#define vdata_b3 16 +#define vdata_b4 20 +/* vdata_brtN names the same register as vdata_bN (see Compute_4Coeffs) */ +#define vdata_brt1 8 +#define vdata_brt2 12 +#define vdata_brt3 16 +#define vdata_brt4 20 + +#define vdata_mont1 25 +#define vdata_mont2 26 +#define vdata_mont3 30 +#define vdata_mont4 31 + +#define vresult_brt1 4 +#define vresult_brt2 9 +#define vresult_brt3 13 +#define vresult_brt4 17 +#define vresult_mont1 13 +#define vresult_mont2 18 +#define vresult_mont3 23 +#define vresult_mont4 28 + +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + +.macro SAVE_REGS + stdu 1, -352(1) /* allocate a 352-byte stack frame */ + mflr 0 /* r0 = LR, put back by mtlr in RESTORE_REGS */ + std 14, 56(1) /* save r14-r21 at frame offsets 56..112 */ + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 /* v20-v25 go to frame offsets 128..208 */ + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 /* v26-v31 go to frame offsets 224..304 */ + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx
32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 /* reload v20-v31 first: r14-r16 serve as offset registers */ + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) /* then reload r14-r21 */ + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 /* release the stack frame */ +.endm + +/* + * Compute sum and difference of r[j] (vdata_aN) and r[j+len] (vdata_bN) + * r[j] + r[j+len] : V8, V12, V16, V20 (data for Barrett reduce) + * r[j+len] - r[j]: V25, V26, V30, V31 (data for Montgomery reduce) + */ +.macro Compute_4Coeffs + vsubuhm vdata_mont1, vdata_b1, vdata_a1 + vsubuhm vdata_mont2, vdata_b2, vdata_a2 + vsubuhm vdata_mont3, vdata_b3, vdata_a3 + vsubuhm vdata_mont4, vdata_b4, vdata_a4 + vadduhm vdata_brt1, vdata_b1, vdata_a1 + vadduhm vdata_brt2, vdata_b2, vdata_a2 + vadduhm vdata_brt3, vdata_b3, vdata_a3 + vadduhm vdata_brt4, vdata_b4, vdata_a4 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coefficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes.
+ * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + next + * R19: offset to r'2 = r'1 + next + * R21: offset to r'3 = r'2 + next + * + */ +.macro Init_Coeffs_offset start next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* j + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next +.endm + +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load coefficient in r' vectors from offset, R10, R17, R19 and R21 + * Load coefficient in r vectors from offset, R9, R16, R18 and R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+vdata_b1, rinp, b1_offset /* V8: vector for r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V12: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V16: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V20: vector for r'3 */ + + lxvd2x 32+vdata_a1, rinp, a1_offset /* V21: vector for r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V22: vector for r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V23: vector for r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V24: vector for r3 */ +.endm + +/* + * Set up the offsets, load the 8 coefficient vectors and compute their + * sums and differences (Init_Coeffs_offset, Load_4Rjp, Compute_4Coeffs), + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, rjlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14,
rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, rjlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * For the coefficient computation, the zeta vector is pre-arranged + * in the matching order for the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+25, 0, dup_rinp /* r10-r12, r15-r18 hold byte offsets 16..112, set in intt_ppc */ + lxvd2x 32+26, 10, dup_rinp + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication.
*/ +.macro Load_L44Coeffs + lxvd2x 10, 0, dup_rinp /* r10-r12, r15-r18 hold byte offsets 16..112, set in intt_ppc */ + lxvd2x 11, 10, dup_rinp + xxpermdi 32+vdata_b1, 11, 10, 3 + xxpermdi 32+vdata_a1, 11, 10, 0 + lxvd2x 10, 11, dup_rinp + lxvd2x 11, 12, dup_rinp + xxpermdi 32+vdata_b2, 11, 10, 3 + xxpermdi 32+vdata_a2, 11, 10, 0 + lxvd2x 10, 15, dup_rinp + lxvd2x 11, 16, dup_rinp + xxpermdi 32+vdata_b3, 11, 10, 3 + xxpermdi 32+vdata_a3, 11, 10, 0 + lxvd2x 10, 17, dup_rinp + lxvd2x 11, 18, dup_rinp + xxpermdi 32+vdata_b4, 11, 10, 3 + xxpermdi 32+vdata_a4, 11, 10, 0 +.endm +/* Barrett reduction: _vN = x - Q * ((x * 20159 + 2^25) >> 26), with x = vdata_brtN */ +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + /* Restore constant vectors + V_MKQ, V2pw25 and V_26 (saved in vs6, vs7, vs8); v7 becomes zero */ + vxor 7, 7, 7 + xxlor 32+3, 6, 6 + xxlor 32+1, 7, 7 + xxlor 32+2, 8, 8 + /* Multiply odd/even signed halfwords; + word results are bounded by 2^32 in abs value. */ + vmulosh 6, vdata_brt1, V20159 + vmulesh 5, vdata_brt1, V20159 + vmulosh 11, vdata_brt2, V20159 + vmulesh 10, vdata_brt2, V20159 + vmulosh 15, vdata_brt3, V20159 + vmulesh 14, vdata_brt3, V20159 + vmulosh 19, vdata_brt4, V20159 + vmulesh 18, vdata_brt4, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V2pw25 + vadduwm 5, 5, V2pw25 + vadduwm 9, 9, V2pw25 + vadduwm 10, 10, V2pw25 + vadduwm 13, 13, V2pw25 + vadduwm 14, 14, V2pw25 + vadduwm 17, 17, V2pw25 + vadduwm 18, 18, V2pw25 + /* Right shift and pack lower halfword, + results bounded by 2^16 in abs value */ + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 /* negate the quotient (v7 is zero) */ + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + /* Modulo multiply-low unsigned halfword; + results bounded by 2^16 * q in abs value.
*/ + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +/* + * Montgomery-multiply vdata_mont1..4 by _vz0.._vz3; results go to _vo0.._vo3 + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 + /* Modular multiplication bounded by 2^16 * q in abs value; addend v3 is the zero vector */ + vmladduhm 15, vdata_mont1, \_vz0, 3 + vmladduhm 20, vdata_mont2, \_vz1, 3 + vmladduhm 27, vdata_mont3, \_vz2, 3 + vmladduhm 28, vdata_mont4, \_vz3, 3 + + /* Signed multiply-high-round; outputs are bounded by 2^15 * q in abs value */ + vmhraddshs 14, vdata_mont1, \_vz0, 3 + vmhraddshs 19, vdata_mont2, \_vz1, 3 + vmhraddshs 24, vdata_mont3, \_vz2, 3 + vmhraddshs 29, vdata_mont4, \_vz3, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 27, V_QINV, 3 + vmladduhm 30, 28, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + /* Arithmetic shift right by 1 bit (v4 holds 1 in every halfword) */ + vsrah \_vo0, 15, 4 + vsrah \_vo1, 20, 4 + vsrah \_vo2, 25, 4 + vsrah \_vo3, 30, 4 +.endm + +/* + * setup constant vectors for Montgomery multiplication + * V_NMKQ, V_QINV, Zero vector, One vector + */ +.macro Set_mont_consts + xxlor 32+5, 0, 0 /* V_NMKQ */ + xxlor 32+2, 2, 2 /* V_QINV */ + xxlor 32+3, 3, 3 /* all 0 */ + xxlor 32+4, 4, 4 /* all 1 */ +.endm +/* Load 4 zeta vectors at zeta_inp into V_Z0..V_Z3 and advance zeta_inp by 64 bytes */ +.macro Load_next_4zetas + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 8, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + addi zeta_inp, zeta_inp, 64 +.endm +/* Store 4 vectors at the r[j] offsets a1_offset..a4_offset */ +.macro Write_B4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, rinp, a1_offset + stxvd2x \_vs1, rinp, a2_offset + stxvd2x \_vs2, rinp, a3_offset + stxvd2x \_vs3, rinp, a4_offset +.endm +/* Store 4 vectors at the r[j+len] offsets b1_offset..b4_offset */ +.macro Write_M4C _vs0 _vs1 _vs2 _vs3 + stxvd2x \_vs0, rinp, b1_offset + stxvd2x \_vs1, rinp, b2_offset + stxvd2x \_vs2, rinp, b3_offset + stxvd2x \_vs3, rinp, b4_offset +.endm
+/* Load 4 vectors at rinp into vdata_mont1..4 and advance rinp by 64 bytes */ +.macro Reload_4coeffs + lxvd2x 32+vdata_mont1, 0, rinp + lxvd2x 32+vdata_mont2, 10, rinp + lxvd2x 32+vdata_mont3, 11, rinp + lxvd2x 32+vdata_mont4, 12, rinp + addi rinp, rinp, 64 +.endm +/* Store 8 vectors over the 128 bytes that end at rinp; rinp is left unchanged */ +.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 + addi rinp, rinp, -128 + stxvd2x \_vs0, 0, rinp + stxvd2x \_vs1, 10, rinp + stxvd2x \_vs2, 11, rinp + stxvd2x \_vs3, 12, rinp + stxvd2x \_vs4, 15, rinp + stxvd2x \_vs5, 16, rinp + stxvd2x \_vs6, 17, rinp + stxvd2x \_vs7, 18, rinp + addi rinp, rinp, 128 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the original + * coefficient array order. + */ +.macro PermWriteL44 + xxlor 32+14, 10, 10 /* fetch the Barrett results parked in vs10-vs13 */ + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+vresult_mont1, 3 + xxpermdi 32+11, 32+14, 32+vresult_mont1, 0 + xxpermdi 32+12, 32+19, 32+vresult_mont2, 3 + xxpermdi 32+13, 32+19, 32+vresult_mont2, 0 + xxpermdi 32+14, 32+24, 32+vresult_mont3, 3 + xxpermdi 32+15, 32+24, 32+vresult_mont3, 0 + xxpermdi 32+16, 32+29, 32+vresult_mont4, 3 + xxpermdi 32+17, 32+29, 32+vresult_mont4, 0 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the original + * coefficient array order.
+ */ +.macro PermWriteL24 + xxlor 32+14, 10, 10 /* fetch the Barrett results parked in vs10-vs13 */ + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, vresult_mont1, 14 + vmrgow 11, vresult_mont1, 14 + vmrgew 12, vresult_mont2, 19 + vmrgow 13, vresult_mont2, 19 + vmrgew 14, vresult_mont3, 24 + vmrgow 15, vresult_mont3, 24 + vmrgew 16, vresult_mont4, 29 + vmrgow 17, vresult_mont4, 29 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp +.endm + +/* + * INTT layer 1, Len=2: processes the 128 bytes at dup_rinp. + */ +.macro INTT_REDUCE_L24 + Load_L24Coeffs + Compute_4Coeffs + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + PermWriteL24 +.endm + +/* + * INTT layer 2, Len=4: processes the 128 bytes at dup_rinp. + */ +.macro INTT_REDUCE_L44 + Load_L44Coeffs + Compute_4Coeffs + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + PermWriteL44 +.endm + +/* + * INTT layers 3 and 4, Len=8 and 16: one zeta vector per coefficient vector.
+ */ +.macro INTT_REDUCE_4X start next + Load_4Coeffs \start, \next + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 + Set_mont_consts + Load_next_4zetas + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 +.endm + +/* + * INTT layers 5, 6 and 7, Len=32, 64 and 128: a single zeta vector at zeta_inp (not advanced here). + */ +.macro INTT_REDUCE_L567 start next + Load_4Coeffs \start, \next + BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 + Set_mont_consts + lvx V_ZETA, 0, zeta_inp + MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 +.endm + +/* + * mlk_intt_ppc(int16_t *r, const int16_t *qdata) + * Compute inverse NTT based on the following 7 layers - + * len = 2, 4, 8, 16, 32, 64, 128 + * + * Each layer computes the coefficients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * Matching vectors of leg1 and leg2 are added and subtracted; the sum is + * Barrett-reduced and the difference is Montgomery-multiplied by zeta. + * + * -> leg1 = leg1 + leg2, leg2 = (leg2 - leg1) * zeta + * + * The resulting coefficients are then stored back at each leg's offset. + * + * All lanes of a vector use the same zeta, except for len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coefficients share the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coefficients share the same zeta. + * e.g.
+ * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. + */ +.global MLK_ASM_NAMESPACE(intt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(intt_ppc) + + SAVE_REGS + + /* init vectors and constants + Setup for Montgomery reduce */ + lxvx 0, 0, qinp /* vs0 = -Q (table offset 0); Set_mont_consts copies it to V_NMKQ */ + + li 10, QINV_OFFSET + lxvx 32+V_QINV, 10, qinp + xxlxor 32+3, 32+3, 32+3 + vspltish 4, 1 + xxlor 2, 32+2, 32+2 /* QINV */ + xxlor 3, 32+3, 32+3 /* 0 vector */ + xxlor 4, 32+4, 32+4 /* 1 vector */ + + /* Setup for Barrett reduce */ + li 10, Q_OFFSET + li 11, C20159_OFFSET + lxvx 6, 10, qinp /* V_MKQ */ + lxvx 32+V20159, 11, qinp /* V20159 */ + + vspltisw 8, 13 + vadduwm 8, 8, 8 /* 2 * 13 = 26 */ + xxlor 8, 32+8, 32+8 /* V_26 stored at vs8 */ + + vspltisw 9, 1 + vsubuwm 10, 8, 9 /* value 25 */ + vslw 9, 9, 10 /* 1 shifted left by 25 = 2^25 */ + xxlor 7, 32+9, 32+9 /* V2pw25 stored at vs7 */ + + li 10, 16 /* r10-r12, r15-r18: byte offsets 16..112 used by the load/store macros */ + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + + /* + * Montgomery-multiply all 256 coefficients by the constant 1441 (4 iterations x 128 bytes) + */ + addi zeta_inp, qinp, C1441_OFFSET + lvx V1441, 0, zeta_inp + li 8, 4 + mtctr 8 + + Set_mont_consts +intt_ppc__Loopf: + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + Reload_4coeffs + MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc__Loopf + + addi rinp, rinp, -512 /* rewind rinp to the start of the coefficient array */ + +.align 4 + /* + * Layer 1.
len = 2 + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Update zeta vectors; each vector holds 4 zetas, each repeated twice + * Load zeta vectors in 2-2-2-2 layout + */ + addi zeta_inp, qinp, ZETA_INTT_OFFSET + li len_2, 4 /* len * 2 */ + mr dup_rinp, rinp + + INTT_REDUCE_L24 + addi dup_rinp, dup_rinp, 128 + INTT_REDUCE_L24 + addi dup_rinp, dup_rinp, 128 + INTT_REDUCE_L24 + addi dup_rinp, dup_rinp, 128 + INTT_REDUCE_L24 + addi dup_rinp, dup_rinp, 128 /* NOTE(review): not needed, dup_rinp is reset right below */ + +.align 4 + /* + * Layer 2. len = 4 + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Load zeta vectors in 4-4 layout + */ + mr dup_rinp, rinp + li len_2, 8 + + INTT_REDUCE_L44 + addi dup_rinp, dup_rinp, 128 + INTT_REDUCE_L44 + addi dup_rinp, dup_rinp, 128 + INTT_REDUCE_L44 + addi dup_rinp, dup_rinp, 128 + INTT_REDUCE_L44 + addi dup_rinp, dup_rinp, 128 /* NOTE(review): not needed, dup_rinp is not read after this point */ + +.align 4 + /* + * Layer 3. len = 8, start = 0, 128, 256, 384 + */ + li len_2, 16 + + INTT_REDUCE_4X 0, 32 + INTT_REDUCE_4X 128, 32 + INTT_REDUCE_4X 256, 32 + INTT_REDUCE_4X 384, 32 + +.align 4 + /* + * Layer 4. len = 16, start = 0, 16, 256, 272 + */ + li len_2, 32 + + INTT_REDUCE_4X 0, 64 + + addi zeta_inp, zeta_inp, -64 /* rewind: starts 0 and 16 use the same 4 zeta vectors */ + INTT_REDUCE_4X 16, 64 + + INTT_REDUCE_4X 256, 64 + + addi zeta_inp, zeta_inp, -64 /* rewind: starts 256 and 272 use the same 4 zeta vectors */ + INTT_REDUCE_4X 272, 64 + +.align 4 + /* + * Layer 5. len = 32, start = 0, 128, 256, 384 + */ + li len_2, 64 + + INTT_REDUCE_L567 0, 16 + addi zeta_inp, zeta_inp, 16 + INTT_REDUCE_L567 128, 16 + addi zeta_inp, zeta_inp, 16 + INTT_REDUCE_L567 256, 16 + addi zeta_inp, zeta_inp, 16 + INTT_REDUCE_L567 384, 16 + addi zeta_inp, zeta_inp, 16 + +.align 4 + /* + * Layer 6. len = 64, start = 0, 64, 256, 320 + */ + li len_2, 128 + + INTT_REDUCE_L567 0, 16 + INTT_REDUCE_L567 64, 16 + addi zeta_inp, zeta_inp, 16 + INTT_REDUCE_L567 256, 16 + INTT_REDUCE_L567 320, 16 + addi zeta_inp, zeta_inp, 16 + +.align 4 + /* + * Layer 7.
len = 128, start = 0, 64, 128, 192 + */ + li len_2, 256 /* len*2 */ + + INTT_REDUCE_L567 0, 16 /* all four calls use the same, last zeta vector */ + INTT_REDUCE_L567 64, 16 + INTT_REDUCE_L567 128, 16 + INTT_REDUCE_L567 192, 16 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V2pw25 +#undef V_26 +#undef V_MKQ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V1441 +#undef vdata_a1 +#undef vdata_a2 +#undef vdata_a3 +#undef vdata_a4 +#undef vdata_b1 +#undef vdata_b2 +#undef vdata_b3 +#undef vdata_b4 +#undef vdata_brt1 +#undef vdata_brt2 +#undef vdata_brt3 +#undef vdata_brt4 +#undef vdata_mont1 +#undef vdata_mont2 +#undef vdata_mont3 +#undef vdata_mont4 +#undef vresult_brt1 +#undef vresult_brt2 +#undef vresult_brt3 +#undef vresult_brt4 +#undef vresult_mont1 +#undef vresult_mont2 +#undef vresult_mont3 +#undef vresult_mont4 +#undef rinp +#undef dup_rinp +#undef qinp +#undef len_2 +#undef zeta_inp +#undef a1_offset +#undef a2_offset +#undef a3_offset +#undef a4_offset +#undef b1_offset +#undef b2_offset +#undef b3_offset +#undef b4_offset + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..788c1cfd3f --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc.S @@ -0,0 +1,653 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright IBM Corp.
2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_QINV 2 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 + +#define vdata_a1 12 +#define vdata_a2 17 +#define vdata_a3 22 +#define vdata_a4 27 +#define vdata_b1 13 +#define vdata_b2 18 +#define vdata_b3 23 +#define vdata_b4 28 + +#define vresult_a1 15 +#define vresult_b1 16 +#define vresult_a2 20 +#define vresult_b2 21 +#define vresult_a3 25 +#define vresult_b3 26 +#define vresult_a4 30 +#define vresult_b4 31 + +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + +.machine "any" +.text + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + 
lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coefficient array. + * + * start: beginning of the offset to the coefficient array. + * next: Next offset. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + step + * R19: offset to r'2 = r'1 + step + * R21: offset to r'3 = r'2 + step + * + */ +.macro Init_Coeffs_offset start next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* J + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+vdata_b1, rinp, b1_offset /* V13: vector r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V18: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V23: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V28: vector for r'3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start next + Init_Coeffs_offset \start \next + Load_4Rjp +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * 
following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, arlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, arlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+25, 0, dup_rinp + lxvd2x 32+26, 10, dup_rinp + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, zeta vector will arrange + * in the proper order to match the multiplication. 
+ */ +.macro Load_L44Coeffs + lxvd2x 1, 0, dup_rinp + lxvd2x 2, 10, dup_rinp + xxpermdi 32+vdata_b1, 2, 1, 3 + xxpermdi 32+vdata_a1, 2, 1, 0 + lxvd2x 3, 11, dup_rinp + lxvd2x 4, 12, dup_rinp + xxpermdi 32+vdata_b2, 4, 3, 3 + xxpermdi 32+vdata_a2, 4, 3, 0 + lxvd2x 1, 15, dup_rinp + lxvd2x 2, 16, dup_rinp + xxpermdi 32+vdata_b3, 2, 1, 3 + xxpermdi 32+vdata_a3, 2, 1, 0 + lxvd2x 3, 17, dup_rinp + lxvd2x 4, 18, dup_rinp + xxpermdi 32+vdata_b4, 4, 3, 3 + xxpermdi 32+vdata_a4, 4, 3, 0 +.endm + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + * ----------------------------------- + * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + */ +.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 + /* fqmul = zeta * coefficient + Modular multification bond by 2^16 * q in abs value */ + vmladduhm 15, vdata_b1, \_vz0, rinp + vmladduhm 20, vdata_b2, \_vz1, rinp + vmladduhm 25, vdata_b3, \_vz2, rinp + vmladduhm 30, vdata_b4, \_vz3, rinp + + /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ + vmhraddshs 14, vdata_b1, \_vz0, rinp + vmhraddshs 19, vdata_b2, \_vz1, rinp + vmhraddshs 24, vdata_b3, \_vz2, rinp + vmhraddshs 29, vdata_b4, \_vz3, rinp + + vmladduhm 15, 15, V_QINV, rinp + vmladduhm 20, 20, V_QINV, rinp + vmladduhm 25, 25, V_QINV, rinp + vmladduhm 30, 30, V_QINV, rinp + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 30, 30, V_NMKQ, 29 + + /* Shift right 1 bit */ + vsrah vdata_b1, 15, 4 + vsrah vdata_b2, 20, 4 + vsrah vdata_b3, 25, 4 + vsrah vdata_b4, 30, 4 +.endm + +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+vdata_a1, rinp, a1_offset /* V12: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V17: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V22: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V27: vector r3 */ +.endm + +/* + * Compute 
final final r[j] and r[j+len] + * final r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ +.macro Compute_4Coeffs + /* Since the result of the Montgomery multiplication is bounded + by q in absolute value. + Finally to complete the final update of the results with add/sub + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm vresult_b1, vdata_a1, vdata_b1 + vadduhm vresult_a1, vdata_b1, vdata_a1 + vsubuhm vresult_b2, vdata_a2, vdata_b2 + vadduhm vresult_a2, vdata_b2, vdata_a2 + vsubuhm vresult_b3, vdata_a3, vdata_b3 + vadduhm vresult_a3, vdata_b3, vdata_a3 + vsubuhm vresult_b4, vdata_a4, vdata_b4 + vadduhm vresult_a4, vdata_b4, vdata_a4 +.endm + +.macro Write_One + stxvd2x 32+vresult_a1, rinp, a1_offset + stxvd2x 32+vresult_b1, rinp, b1_offset + stxvd2x 32+vresult_a2, rinp, a2_offset + stxvd2x 32+vresult_b2, rinp, b2_offset + stxvd2x 32+vresult_a3, rinp, a3_offset + stxvd2x 32+vresult_b3, rinp, b3_offset + stxvd2x 32+vresult_a4, rinp, a4_offset + stxvd2x 32+vresult_b4, rinp, b4_offset +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the orginal + * coefficient array order. + */ +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+vresult_a1, 32+vresult_b1, 3 + xxpermdi 1, 32+vresult_a1, 32+vresult_b1, 0 + xxpermdi 2, 32+vresult_a2, 32+vresult_b2, 3 + xxpermdi 3, 32+vresult_a2, 32+vresult_b2, 0 + xxpermdi 4, 32+vresult_a3, 32+vresult_b3, 3 + xxpermdi 5, 32+vresult_a3, 32+vresult_b3, 0 + xxpermdi 6, 32+vresult_a4, 32+vresult_b4, 3 + xxpermdi 7, 32+vresult_a4, 32+vresult_b4, 0 + stxvd2x 0, 0, dup_rinp + stxvd2x 1, 10, dup_rinp + stxvd2x 2, 11, dup_rinp + stxvd2x 3, 12, dup_rinp + stxvd2x 4, 15, dup_rinp + stxvd2x 5, 16, dup_rinp + stxvd2x 6, 17, dup_rinp + stxvd2x 7, 18, dup_rinp +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * coefficient array order. 
+ */ +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, vresult_b1, vresult_a1 + vmrgow 11, vresult_b1, vresult_a1 + vmrgew 12, vresult_b2, vresult_a2 + vmrgow 13, vresult_b2, vresult_a2 + vmrgew 14, vresult_b3, vresult_a3 + vmrgow 15, vresult_b3, vresult_a3 + vmrgew 16, vresult_b4, vresult_a4 + vmrgow 17, vresult_b4, vresult_a4 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp +.endm + +.macro Load_next_4zetas + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 10, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + addi zeta_inp, zeta_inp, 64 +.endm + +/* + * NTT layer 7, Len=2. + */ +.macro NTT_REDUCE_L24 + Load_next_4zetas + Load_L24Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL24 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT layer 6, Len=4. + */ +.macro NTT_REDUCE_L44 + Load_next_4zetas + Load_L44Coeffs + MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + PermWriteL44 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT other layers, 1, 2, 3, 4, 5. + */ +.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 + Load_4Coeffs \start, \next + MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + Load_4Rj + Compute_4Coeffs + Write_One +.endm + +/* + * mlk_ntt_ppc(int16_t *r, int16_t *qdata) + * Compute forward NTT based on the following 7 layers - + * len = 128, 64, 32, 16, 8, 4, 2. + * + * Each layer compute the coefficients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * The final coefficient (t) from each vector of leg1 and leg2 then do the + * add/sub operations to obtain the final results. 
+ * + * -> leg1 = leg1 + t, leg2 = leg1 - t + * + * The resulting coeffients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coefficients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coefficients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. + * + */ +.global MLK_ASM_NAMESPACE(ntt_ppc) +.align 4 +MLK_ASM_FN_SYMBOL(ntt_ppc) + + SAVE_REGS + + /* load MLKEM_Q */ + lvx V_NMKQ,0,qinp + + /* Register 14 as pointer to zetas array */ + addi zeta_inp, qinp, ZETA_NTT_OFFSET + + vxor 3, 3, 3 + vspltish 4, 1 + + li 10, QINV_OFFSET + lvx V_QINV, 10, qinp + +.align 4 + /* + * Layer 1. len = 128, start= 0, 64, 128, 192 + */ + li len_2, 256 /* len * 2 */ + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * Layer 2. len = 64, start= 0, 64, 256, 320 + */ + li len_2, 128 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * Layer 3. 
len = 32, start = 0, 128, 256, 384 + */ + li len_2, 64 + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + + lvx V_ZETA, 0, zeta_inp + addi zeta_inp, zeta_inp, 16 + NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + +.align 4 + /* + * Layer 4. len = 16, start = 0, 16, 256, 272 + */ + li len_2, 32 + Load_next_4zetas + NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + +.align 4 + /* + * Layer 5. len = 8, start= 0, 128, 256, 384 + */ + li len_2, 16 + Load_next_4zetas + NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + Load_next_4zetas + NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + + /* + * Layer 6. len = 4, + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Load zeta vectors in 4-4 layout + */ + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + +.align 4 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + NTT_REDUCE_L44 + + /* + * Layer 7. len = 2 + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Load zeta vectors in 2-2-2-2 layout + */ + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 4 + +.align 4 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + NTT_REDUCE_L24 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
+ * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_QINV +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef vdata_a1 +#undef vdata_a2 +#undef vdata_a3 +#undef vdata_a4 +#undef vdata_b1 +#undef vdata_b2 +#undef vdata_b3 +#undef vdata_b4 +#undef vresult_a1 +#undef vresult_b1 +#undef vresult_a2 +#undef vresult_b2 +#undef vresult_a3 +#undef vresult_b3 +#undef vresult_a4 +#undef vresult_b4 +#undef rinp +#undef dup_rinp +#undef qinp +#undef len_2 +#undef zeta_inp +#undef a1_offset +#undef a2_offset +#undef a3_offset +#undef a4_offset +#undef b1_offset +#undef b2_offset +#undef b3_offset +#undef b4_offset + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..877d9e65e7 --- /dev/null +++ b/dev/ppc64le/src/poly_tomont.S @@ -0,0 +1,189 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright IBM Corp. 
2025, 2026 + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.machine "any" +.text + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + *----------------------------------- + * MREDUCE_4X(_v0, _v1, _v2, _v3) + */ + +.macro MREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+13, 0, 3 + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 // >> 1 + vsrah \_v1, 20, 4 // >> 1 + vsrah \_v2, 25, 4 // >> 1 + vsrah \_v3, 9, 4 // >> 1 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +.balign 16 +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + stdu 1, -320(1) + mflr 0 + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 
32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, NQ_OFFSET + li 7, QINV_OFFSET + li 8, C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 + vspltish 4, 1 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + MREDUCE_4X 27, 28, 29, 30 + MREDUCE_4X 13, 18, 23, 7 + Write_8X + + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S new file mode 100644 index 0000000000..335beeb4fc --- /dev/null +++ b/dev/ppc64le/src/reduce.S @@ -0,0 +1,236 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * Copyright IBM Corp. 
2025, 2026 + * + *=================================================================================== + * Written by Danny Tsen + * + */ +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +#include "consts.h" + +/* + * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial + * for details of the Barrett reduction + * + * Arguments: *r: pointer to input/output polynomial + */ + +// Barrett reduce constatnts +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.machine "any" +.text + +.macro BREDUCE_4X _v0 _v1 _v2 _v3 + lxvd2x 32+8, 0, 3 + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + 
stxvd2x 32+17, 11, 3 +.endm + +/* + * Conditional addition to get unsigned canonical representative + */ +.macro To_unsigned_16 + lxvd2x 32+12, 0, 3 + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 +.endm + +.global MLK_ASM_NAMESPACE(reduce_ppc) +.balign 16 +MLK_ASM_FN_SYMBOL(reduce_ppc) + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 + + li 6, Q_OFFSET + li 7, C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 + li 15, 32 + li 16, 48 + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + + BREDUCE_4X 21, 22, 23, 24 + BREDUCE_4X 4, 9, 13, 17 + Write_8X + +.align 4 + // + // To unsigned canonical + // + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, V_MKQ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 
32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 7d8e50d4c6..9c7fe672ab 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index aa88537d3f..f46dbfdbf1 100644 --- 
a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . + compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 254d67478a..1b01c4d426 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,3 +89,22 @@ implementations: - Darwin required_flags: - asimd +- name: ppc64le + version: FIPS203 + folder_name: . 
+ compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" + signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair + signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand + signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc + signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec + sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h + mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h + mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c + mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h + mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h + mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h + mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le + supported_platforms: + - architecture: ppc64le + operating_systems: + - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h new file mode 100644 index 0000000000..2fa1cdbcf6 --- /dev/null +++ b/integration/liboqs/config_ppc64le.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [FIPS140_3_IG] + * Implementation Guidance for FIPS 140-3 and the Cryptographic Module + * Validation Program National Institute of Standards and Technology + * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements + */ + +#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H +#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H + +/****************************************************************************** + * Name: MLK_CONFIG_PARAMETER_SET + * + * Description: Specifies the parameter set for ML-KEM + * - 
MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 + * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 + * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#ifndef MLK_CONFIG_PARAMETER_SET +#define MLK_CONFIG_PARAMETER_SET \ + 768 /* Change this for different security strengths */ +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_NAMESPACE_PREFIX + * + * Description: The prefix to use to namespace global symbols from mlkem/. + * + * In a multi-level build (that is, if either + * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or + * - MLK_CONFIG_MULTILEVEL_NO_SHARED, + * is set), level-dependent symbols will additionally be prefixed + * with the parameter set (512/768/1024). + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#if MLK_CONFIG_PARAMETER_SET == 512 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 768 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE +#elif MLK_CONFIG_PARAMETER_SET == 1024 +#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE +#endif + +/****************************************************************************** + * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + * + * Description: Determines whether a native arithmetic backend should be used. + * + * The arithmetic backend covers performance critical functions + * such as the number-theoretic transform (NTT). + * + * If this option is unset, the C backend will be used. + * + * If this option is set, the arithmetic backend to be used is + * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is + * unset, the default backend for the target architecture + * will be used. 
If set, it must be the name of a backend metadata + * file. + * + * This can also be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH + +/****************************************************************************** + * Name: MLK_CONFIG_ARITH_BACKEND_FILE + * + * Description: The arithmetic backend to use. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option + * is ignored. + * + * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must + * either be undefined or the filename of an arithmetic backend. + * If unset, the default backend will be used. + * + * This can be set using CFLAGS. + * + *****************************************************************************/ +#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. + * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ + "../../integration/liboqs/fips202_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER + * + * Description: Custom header to use for FIPS-202-X4 + * + * This should only be set if you intend to use a custom + * FIPS-202 implementation, different from the one shipped + * with mlkem-native. 
+ * + * If set, it must be the name of a file serving as the + * replacement for mlkem/fips202/fips202x4.h, and exposing + * the same API (see FIPS202.md). + * + *****************************************************************************/ +/* +#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ + "../../integration/liboqs/fips202x4_glue.h" +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_ZEROIZE + * + * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes + * intermediate stack buffers before returning from function calls. + * + * Set this option and define `mlk_zeroize` if you want to + * use a custom method to zeroize intermediate stack buffers. + * The default implementation uses SecureZeroMemory on Windows + * and a memset + compiler barrier otherwise. If neither of those + * is available on the target platform, compilation will fail, + * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide + * a custom implementation of `mlk_zeroize()`. + * + * WARNING: + * The explicit stack zeroization conducted by mlkem-native + * reduces the likelihood of data leaking on the stack, but + * does not eliminate it! The C standard makes no guarantee about + * where a compiler allocates structures and whether/where it makes + * copies of them. Also, in addition to entire structures, there + * may also be potentially exploitable leakage of individual values + * on the stack. + * + * If you need bullet-proof zeroization of the stack, you need to + * consider additional measures instead of what this feature + * provides. In this case, you can set mlk_zeroize to a no-op. + * + *****************************************************************************/ +/* #define MLK_CONFIG_CUSTOM_ZEROIZE + #if !defined(__ASSEMBLER__) + #include + #include "sys.h" + static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) + { + ... your implementation ... 
+ } + #endif +*/ + +/****************************************************************************** + * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES + * + * Description: mlkem-native does not provide a secure randombytes + * implementation. Such an implementation has to be provided by the + * consumer. + * + * If this option is not set, mlkem-native expects a function + * void randombytes(uint8_t *out, size_t outlen). + * + * Set this option and define `mlk_randombytes` if you want to + * use a custom method to sample randombytes with a different name + * or signature. + * + *****************************************************************************/ +#define MLK_CONFIG_CUSTOM_RANDOMBYTES +#if !defined(__ASSEMBLER__) +#include +#include +#include "../../mlkem/src/sys.h" +static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) +{ + OQS_randombytes(ptr, len); +} +#endif /* !__ASSEMBLER__ */ + +/****************************************************************************** + * Name: MLK_CONFIG_NO_ASM + * + * Description: If this option is set, mlkem-native will be built without + * use of native code or inline assembly. + * + * By default, inline assembly is used to implement value barriers. + * Without inline assembly, mlkem-native will use a global volatile + * 'opt blocker' instead; see verify.h. + * + * Inline assembly is also used to implement a secure zeroization + * function on non-Windows platforms. If this option is set and + * the target platform is not Windows, you MUST set + * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization + * function. + * + * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and + * MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no + * native backends will be used. 
+ * + *****************************************************************************/ +/* #define MLK_CONFIG_NO_ASM */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT + * + * Description: Compliance with @[FIPS140_3_IG, p.87] requires a + * Pairwise Consistency Test (PCT) to be carried out on a freshly + * generated keypair before it can be exported. + * + * Set this option if such a check should be implemented. + * In this case, crypto_kem_keypair_derand and crypto_kem_keypair + * will return a non-zero error code if the PCT failed. + * + * NOTE: This feature will drastically lower the performance of + * key generation. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT */ + +/****************************************************************************** + * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + * + * Description: If this option is set, the user must provide a runtime + * function `static inline int mlk_break_pct() { ... }` to + * indicate whether the PCT should be made to fail. + * + * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. + * + *****************************************************************************/ +/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST + #if !defined(__ASSEMBLER__) + #include "sys.h" + static MLK_INLINE int mlk_break_pct(void) + { + ... return 0/1 depending on whether PCT should be broken ... + } + #endif +*/ + +/* Enable valgrind-based assertions in mlkem-native through macro + * from libOQS. 
*/ +#if !defined(__ASSEMBLER__) +#include +#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) +#define MLK_CONFIG_CT_TESTING_ENABLED +#endif +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index 4291d629b1..dcd539ab13 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -22,4 +22,8 @@ #include "riscv64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 0000000000..5125a40eae --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. + diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 0000000000..54b3ddd9c6 --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. 
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ + mlk_ntt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ + mlk_intt_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ + mlk_reduce_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} + +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ + mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..116f6d7a6b --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2024-2025 The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) +void mlk_ntt_ppc(int16_t *, const int16_t *); + +#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) +void mlk_intt_ppc(int16_t *, const int16_t *); + +#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) +void mlk_reduce_ppc(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) 
+void mlk_poly_tomont_ppc(int16_t *, const int16_t *); + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 0000000000..35c3e4b335 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include +#include +#include +#include + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +MLK_ALIGN const int16_t mlk_ppc_qdata[] = { + /* -Q */ + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + /* QINV */ + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + /* Q */ + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + 1441, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" + , +/* zetas for invNTT */ +#include "consts_intt.inc" +}; +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 0000000000..90ad7b51cf --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include 
"../../../common.h" + +/* Offsets into the constant table */ +/* check-magic: off */ +#define NQ_OFFSET 0 +#define QINV_OFFSET 16 +#define Q_OFFSET 32 +#define C20159_OFFSET 48 +#define C1441_OFFSET 64 +#define C1353_OFFSET 80 +#define ZETA_NTT_OFFSET 96 +#define ZETA_INTT_OFFSET 1104 +/* check-magic: on */ + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +extern const int16_t mlk_ppc_qdata[]; +#endif + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc new file mode 100644 index 0000000000..d0203dd178 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -0,0 +1,59 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* + * For intt Len=2, offset IZETA_NTT_OFFSET127 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, + -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, + -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, + -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, + 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, + -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, + -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, + 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, + /* For intt Len=4 */ + 677, 677, 677, 677, -1275, -1275, -1275, -1275, + 448, 448, 448, 448, -1065, -1065, -1065, -1065, + -1508, -1508, -1508, -1508, -725, -725, -725, -725, + -398, -398, -398, 
-398, 961, 961, 961, 961, + -247, -247, -247, -247, -951, -951, -951, -951, + 107, 107, 107, 107, -1421, -1421, -1421, -1421, + -271, -271, -271, -271, 830, 830, 830, 830, + -853, -853, -853, -853, -90, -90, -90, -90, + 126, 126, 126, 126, 1469, 1469, 1469, 1469, + -1618, -1618, -1618, -1618, -1162, -1162, -1162, -1162, + -320, -320, -320, -320, -666, -666, -666, -666, + 516, 516, 516, 516, -8, -8, -8, -8, + -282, -282, -282, -282, -1544, -1544, -1544, -1544, + -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, + -552, -552, -552, -552, 1015, 1015, 1015, 1015, + 1223, 1223, 1223, 1223, 652, 652, 652, 652, + /* For intt Len=8 and others */ + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, + -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, + 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, + 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, + -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, + -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, + 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, + -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, + 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, + -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, + 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, + 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, + -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, + 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, + 1493, 1493, 1493, 1493, 1493, -1517, 
-1517, -1517, -1517, -1517, -1517, + -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, + -758, -758, -758, -758, -758, -758 diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc new file mode 100644 index 0000000000..2a0136f1e5 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -0,0 +1,59 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + /* For ntt Len=128, offset 96 */ + -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, + -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, + 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, + 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, + -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, + 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, + 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, + 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, + 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, + -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, + 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, + -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, + -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, + 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, + 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, + -1571, 
-1571, -1571, -1571, -1571, -1571, + /* For Len=4 */ + 652, 652, 652, 652, 1223, 1223, 1223, 1223, + 1015, 1015, 1015, 1015, -552, -552, -552, -552, + 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, + -1544, -1544, -1544, -1544, -282, -282, -282, -282, + -8, -8, -8, -8, 516, 516, 516, 516, + -666, -666, -666, -666, -320, -320, -320, -320, + -1162, -1162, -1162, -1162, -1618, -1618, -1618, -1618, + 1469, 1469, 1469, 1469, 126, 126, 126, 126, + -90, -90, -90, -90, -853, -853, -853, -853, + 830, 830, 830, 830, -271, -271, -271, -271, + -1421, -1421, -1421, -1421, 107, 107, 107, 107, + -951, -951, -951, -951, -247, -247, -247, -247, + 961, 961, 961, 961, -398, -398, -398, -398, + -725, -725, -725, -725, -1508, -1508, -1508, -1508, + -1065, -1065, -1065, -1065, 448, 448, 448, 448, + -1275, -1275, -1275, -1275, 677, 677, 677, 677, + /* + * For ntt Len=2 + * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) + * Transpose z[0], z[1], z[2], z[3] + * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] + */ + 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, + 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, + -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, + 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, + 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, + -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, + -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, + 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460 diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S new file mode 100644 index 0000000000..903f4eeffc --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -0,0 
+1,3418 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright IBM Corp. 2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/intt_ppc.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(intt_ppc) +MLK_ASM_FN_SYMBOL(intt_ppc) + + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 52, 10, 1 + stxvx 53, 11, 1 + stxvx 54, 12, 1 + stxvx 55, 14, 1 + stxvx 56, 15, 1 + stxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 58, 10, 1 + stxvx 59, 11, 1 + stxvx 60, 12, 1 + stxvx 61, 14, 1 + stxvx 62, 15, 1 + stxvx 63, 16, 1 + lxvx 0, 0, 4 + li 10, 16 + lxvx 34, 10, 4 + xxlxor 35, 35, 35 + vspltish 4, 1 + xxlor 2, 34, 34 + xxlor 3, 35, 35 + xxlor 4, 36, 36 + li 10, 32 + li 11, 48 + lxvx 6, 10, 4 + lxvx 32, 11, 4 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 40, 40 + vspltisw 9, 1 + vsubuwm 10, 8, 9 + vslw 9, 9, 10 + xxlor 7, 41, 41 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + addi 14, 4, 64 + lvx 10, 0, 14 + li 8, 4 + mtctr 8 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + +Lintt_ppc__Loopf: + lxvd2x 57, 0, 3 + lxvd2x 58, 10, 3 + lxvd2x 62, 11, 3 + lxvd2x 63, 12, 3 + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 
26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 6, 15, 4 + vsrah 7, 20, 4 + vsrah 8, 25, 4 + vsrah 9, 30, 4 + lxvd2x 57, 0, 3 + lxvd2x 58, 10, 3 + lxvd2x 62, 11, 3 + lxvd2x 63, 12, 3 + addi 3, 3, 64 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + addi 3, 3, -128 + stxvd2x 38, 0, 3 + stxvd2x 39, 10, 3 + stxvd2x 40, 11, 3 + stxvd2x 41, 12, 3 + stxvd2x 45, 15, 3 + stxvd2x 50, 16, 3 + stxvd2x 55, 17, 3 + stxvd2x 60, 18, 3 + addi 3, 3, 128 + bdnz Lintt_ppc__Loopf + addi 3, 3, -512 + nop + nop + addi 14, 4, 1104 + li 7, 4 + mr 5, 3 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + 
xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + 
vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + 
xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 
+ lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + 
vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + nop + mr 5, 3 + li 7, 8 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + 
vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 
+ xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + 
vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 
7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 
11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 
5, 128 + nop + nop + li 7, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + 
vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + 
vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 
3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 
7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + nop + nop + nop + li 7, 32 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 
2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, -64 + li 9, 16 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 
47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 
5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, -64 + li 9, 272 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 
25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + vmladduhm 15, 25, 7, 3 + vmladduhm 20, 26, 8, 3 + vmladduhm 27, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 7, 3 + vmhraddshs 19, 26, 8, 3 + vmhraddshs 24, 30, 9, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + nop + li 7, 64 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + 
addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + 
stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 
20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 
+ vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 
10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + nop + nop + nop + li 7, 128 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + 
stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 
9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 
9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 320 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + 
vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + nop + li 7, 256 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 
10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + 
xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + 
vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 192 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 
0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 34, 2, 2 + xxlor 35, 3, 3 + xxlor 36, 4, 4 + lvx 10, 0, 14 + vmladduhm 15, 25, 10, 3 + vmladduhm 20, 26, 10, 3 + vmladduhm 27, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmhraddshs 14, 25, 10, 3 + vmhraddshs 19, 26, 10, 3 + vmhraddshs 24, 30, 10, 3 + vmhraddshs 29, 31, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 27, 2, 3 + vmladduhm 30, 28, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 52, 10, 1 + lxvx 53, 11, 1 + lxvx 54, 12, 1 + lxvx 55, 14, 1 + lxvx 56, 15, 1 + lxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 58, 10, 1 + lxvx 59, 11, 1 + lxvx 60, 12, 1 + lxvx 61, 14, 1 + lxvx 62, 15, 1 + lxvx 63, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 
17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S new file mode 100644 index 0000000000..7344a1d419 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -0,0 +1,1791 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright IBM Corp. 2025, 2026 + * + * =================================================================================== + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/ntt_ppc.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(ntt_ppc) +MLK_ASM_FN_SYMBOL(ntt_ppc) + + .cfi_startproc + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 52, 10, 1 + stxvx 53, 11, 1 + stxvx 54, 12, 1 + stxvx 55, 14, 1 + stxvx 56, 15, 1 + stxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 58, 10, 1 + stxvx 59, 11, 1 + stxvx 60, 12, 1 + stxvx 61, 14, 1 + stxvx 62, 15, 1 + stxvx 63, 16, 1 + lvx 5, 0, 4 + addi 14, 4, 96 + vxor 3, 3, 3 + vspltish 4, 1 + li 10, 16 + lvx 2, 10, 4 + li 7, 256 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 
10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 128 + add 10, 
7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 192 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 
27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + li 7, 128 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 
20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 320 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 
10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + nop + li 7, 64 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 
16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 
48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + addi 14, 14, 16 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 10, 3 + vmladduhm 20, 18, 10, 3 + vmladduhm 25, 23, 10, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 10, 3 + vmhraddshs 19, 18, 10, 3 + vmhraddshs 24, 23, 10, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + nop + li 7, 32 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + 
vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 16 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + 
vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 272 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + nop + li 7, 16 + li 10, 16 
+ li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 
18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 
19 + lxvd2x 60, 3, 21 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + mr 5, 3 + li 7, 8 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + nop + nop + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 
17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 1, 0, 5 
+ lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + 
vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + mr 5, 3 + li 7, 4 + nop + nop + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 
10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 
26 + vmrgow 27, 25, 26 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + addi 14, 14, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmladduhm 15, 13, 7, 3 + vmladduhm 20, 18, 8, 3 + vmladduhm 25, 23, 9, 3 + vmladduhm 30, 28, 10, 3 + vmhraddshs 14, 13, 7, 3 + vmhraddshs 19, 18, 8, 3 + vmhraddshs 24, 23, 9, 3 + vmhraddshs 29, 28, 10, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 30, 30, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 30, 30, 5, 29 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 28, 30, 4 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 
17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 52, 10, 1 + lxvx 53, 11, 1 + lxvx 54, 12, 1 + lxvx 55, 14, 1 + lxvx 56, 15, 1 + lxvx 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 58, 10, 1 + lxvx 59, 11, 1 + lxvx 60, 12, 1 + lxvx 61, 14, 1 + lxvx 62, 15, 1 + lxvx 63, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + mtlr 0 + addi 1, 1, 352 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S new file mode 100644 index 0000000000..c0e29d5e04 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -0,0 +1,361 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * Copyright IBM Corp. 
2025, 2026 + * + *=================================================================================== + * Written by Danny Tsen + * + */ + +/* + * Poly_tomont: In-place conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: *r: pointer to input/output polynomial (accessed through r3) + * + * The code also loads three 16-byte vectors at offsets 0, 16 and 80 from + * r4 before r4 is reused as a store offset; presumably r4 carries a + * pointer to a constants table -- confirm against the C prototype. + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/poly_tomont.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc) + + .cfi_startproc + stdu 1, -320(1) /* push a 320-byte stack frame */ + mflr 0 /* r0 = LR; written back with mtlr before blr */ + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 /* r6..r12 = frame offsets 128..224 */ + stxvx 52, 6, 1 + stxvx 53, 7, 1 + stxvx 54, 8, 1 + stxvx 55, 9, 1 + stxvx 56, 10, 1 + stxvx 57, 11, 1 + stxvx 58, 12, 1 /* save VSR52-58 (= v20-v26) in the frame */ + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 59, 6, 1 + stxvx 60, 7, 1 + stxvx 61, 8, 1 + stxvx 62, 9, 1 /* save VSR59-62 (= v27-v30) at frame offsets 240..288 */ + li 6, 0 + li 7, 16 + li 8, 80 + lxvx 37, 6, 4 + lxvx 34, 7, 4 + lxvx 32, 8, 4 /* v5, v2, v0 = vectors at r4+0, r4+16, r4+80; presumably q, a q-inverse and the Montgomery factor -- TODO confirm */ + vxor 3, 3, 3 /* v3 = 0, used as the addend below */ + vspltish 4, 1 /* v4 = 1 in every halfword: shift count for vsrah */ + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 /* r4..r11 = -128..-16: store offsets relative to the advanced r3 */ + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 /* iteration 1 of 4: v13, v18, v23, v7 = 64 bytes at r3; r3 += 64 */ + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 /* t = low 16 bits of a * v0 (+ v3) */ + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 /* h = ((a * v0 + 0x4000) >> 15) + v3, saturated */ + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 /* t = low 16 bits of t * v2 */ + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 /* t = ((t * v5 + 0x4000) >> 15) + h, saturated */ + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 /* v27-v30 = t >> 1 (arithmetic): first four result vectors */ + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3
+ addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 /* next 64 bytes into the same input registers */ + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 /* second four result vectors land in v13, v18, v23, v7 */ + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 /* store 8 result vectors to r3-128..r3-16, i.e. over the 128 bytes just read; end of iteration 1 */ + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 +
stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 /* end of iteration 2 */ + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 /* end of iteration 3 */ + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28,
20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 /* end of iteration 4: 4 x 128 = 512 bytes rewritten in place */ + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 52, 6, 1 + lxvx 53, 7, 1 + lxvx 54, 8, 1 + lxvx 55, 9, 1 + lxvx 56, 10, 1 + lxvx 57, 11, 1 + lxvx 58, 12, 1 /* reload VSR52-58 (= v20-v26) from the frame */ + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 59, 6, 1 + lxvx 60, 7, 1 + lxvx 61, 8, 1 + lxvx 62, 9, 1 /* reload VSR59-62 (= v27-v30) */ + mtlr 0 /* LR = value captured by mflr at entry */ + addi 1, 1, 320 /* pop the 320-byte stack frame */ + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S new file mode 100644 index 0000000000..bf589c4e8f --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -0,0 +1,713 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + + +/* + * Copyright IBM Corp.
2025, 2026 + * + *=================================================================================== + * Written by Danny Tsen + * + */ +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/reduce.S using scripts/simpasm. Do not modify it directly. + */ + + +.text +.balign 16 +.global MLK_ASM_NAMESPACE(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc) + + .cfi_startproc + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 52, 6, 1 + stxvx 53, 7, 1 + stxvx 54, 8, 1 + stxvx 55, 9, 1 + stxvx 56, 10, 1 + vxor 7, 7, 7 + li 6, 32 + li 7, 48 + lxvx 35, 6, 4 + lxvx 32, 7, 4 + vspltisw 2, 13 + vadduwm 2, 2, 2 + vspltisw 4, 1 + vsubuwm 5, 2, 4 + vslw 1, 4, 5 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + li 14, 16 + li 15, 32 + li 16, 48 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + 
vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 
17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 
+ vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 
18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 
+ addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 
14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 52, 6, 1 + lxvx 53, 7, 1 + lxvx 54, 8, 1 + lxvx 55, 9, 1 + lxvx 56, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr + .cfi_endproc + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/test/mk/components.mk b/test/mk/components.mk index af34a048e9..5c64ab4a8c 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -9,6 +9,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard 
mlkem/src/native/riscv64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif From 8855d0d618675c18c606ff3cf25929200bccf05e Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 7 Apr 2026 04:40:24 -0400 Subject: [PATCH 06/27] 1. Update supported systems to POWER9 and above, or Power systems that support ISA 2.07 and above. 2. Fix typos, headers, and return-value annotations (MLK_MUST_CHECK_RETURN_VALUE). Signed-off-by: Danny Tsen --- dev/ppc64le/README.md | 5 +++-- dev/ppc64le/meta.h | 4 ++++ dev/ppc64le/src/arith_native_ppc64le.h | 2 +- dev/ppc64le/src/consts.c | 5 ----- dev/ppc64le/src/intt_ppc.S | 4 ++-- dev/ppc64le/src/poly_tomont.S | 8 ++++---- dev/ppc64le/src/reduce.S | 8 ++++---- mlkem/src/native/ppc64le/README.md | 6 +++--- mlkem/src/native/ppc64le/meta.h | 4 ++++ mlkem/src/native/ppc64le/src/arith_native_ppc64le.h | 2 +- mlkem/src/native/ppc64le/src/consts.c | 5 ----- 11 files changed, 26 insertions(+), 27 deletions(-) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md index 5125a40eae..57abddbd29 100644 --- a/dev/ppc64le/README.md +++ b/dev/ppc64le/README.md @@ -1,6 +1,7 @@ [//]: # (SPDX-License-Identifier: CC-BY-4.0) -# ppc64le backend (little endian) +/* ppc64le backend (little endian) */ -This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. +This directory contains a native backend for little endian POWER 9 (ppc64le) and above systems. +Or, Power systems supports ISA 2.07 and above. 
diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index 34f8cbec66..e804e97c7a 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -25,24 +25,28 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { mlk_ntt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { mlk_intt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { mlk_reduce_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { mlk_poly_tomont_ppc(data, mlk_ppc_qdata); diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index aebb4711ab..282b3566cd 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025 The mlkem-native project authors + * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 35c3e4b335..48fe773ec8 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -3,11 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -#include -#include -#include -#include - #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 3355118384..6782b3b88a 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -20,13 +20,13 @@ .machine "any" .text -/* Barrett reduce constatnts */ +/* Barrett reduce constants */ #define V20159 0 #define V2pw25 1 #define V_26 2 #define 
V_MKQ 3 -/* Montgomery reduce constatnts */ +/* Montgomery reduce constants */ #define V_QINV 2 #define V_NMKQ 5 #define V_Z0 7 diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 877d9e65e7..7089c55756 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -71,10 +71,10 @@ vmhraddshs 25, 25, V_NMKQ, 24 vmhraddshs 9, 9, V_NMKQ, 8 - vsrah \_v0, 15, 4 // >> 1 - vsrah \_v1, 20, 4 // >> 1 - vsrah \_v2, 25, 4 // >> 1 - vsrah \_v3, 9, 4 // >> 1 + vsrah \_v0, 15, 4 + vsrah \_v1, 20, 4 + vsrah \_v2, 25, 4 + vsrah \_v3, 9, 4 .endm .macro Write_8X diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 335beeb4fc..6d23d54037 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -25,7 +25,7 @@ * Arguments: *r: pointer to input/output polynomial */ -// Barrett reduce constatnts +/* Barrett reduce constants */ #define V20159 0 #define V_25 1 #define V_26 2 @@ -190,9 +190,9 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) Write_8X .align 4 - // - // To unsigned canonical - // + /* + * To unsigned canonical + */ addi 3, 3, -512 vxor 9, 9, 9 vspltish 10, 15 diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md index 5125a40eae..522cbfb967 100644 --- a/mlkem/src/native/ppc64le/README.md +++ b/mlkem/src/native/ppc64le/README.md @@ -1,6 +1,6 @@ [//]: # (SPDX-License-Identifier: CC-BY-4.0) -# ppc64le backend (little endian) - -This directory contains a native backend for little endian POWER 8 (ppc64le) and above systems. +/* ppc64le backend (little endian) */ +This directory contains a native backend for little endian POWER 9 (ppc64le) and above systems. +Or, Power systems supports ISA 2.07 and above. 
diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 54b3ddd9c6..f7057051a4 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -25,24 +25,28 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { mlk_ntt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { mlk_intt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { mlk_reduce_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } +MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { mlk_poly_tomont_ppc(data, mlk_ppc_qdata); diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index 116f6d7a6b..7ab3226c48 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025 The mlkem-native project authors + * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ #ifndef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 35c3e4b335..48fe773ec8 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -3,11 +3,6 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ -#include -#include -#include -#include - #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ From 202a94c2b1adbc3b8c35692c5308c3d7fadc76ec Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 7 Apr 2026 05:25:59 -0400 Subject: [PATCH 07/27] Re-run autogen. 
Here is the output. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (venv) [9:51][@MacBookPro] mlkem_test/ % ./scripts/autogen ✓ Generate citations (0.2s) ✓ Generate OQS META.yml files (0.0s) – Generate SLOTHY optimized assembly (0.0s) ✓ Check assembly register aliases (0.1s) ✓ Check assembly loop labels (0.1s) ✓ Normalize assembly macro syntax (0.3s) ✓ Generate zeta and lookup tables (0.0s) ✓ Generate HOL Light assembly (1.7s) ✓ Synchronize backends (1.4s) ✓ Generate header guards (0.1s) ✓ Complete final backend synchronization (0.6s) – Update HOL Light bytecode (0.0s) ✓ Generate monolithic source files (1.6s) ✓ Generate undefs (1.4s) ✓ Generate test configs (0.0s) ✓ Check macro typos (0.3s) /Users/danny/my_repo/ws/docker_mlkem/mlkem_test/./scripts/autogen:500: PyparsingDeprecationWarning: 'parseString' deprecated - use 'parse_string' exp = self.parser.parseString(exp, parseAll=True).as_list()[0] ✓ Generate preprocessor comments (1.6s) ✓ Format files (2.2s) updated BIBLIOGRAPHY.md updated mlkem/src/native/ppc64le/src/reduce.S updated mlkem/src/native/ppc64le/src/poly_tomont.S updated dev/ppc64le/src/ntt_ppc.S updated dev/ppc64le/src/reduce.S updated mlkem/src/native/ppc64le/src/intt_ppc.S updated dev/ppc64le/src/poly_tomont.S updated mlkem/mlkem_native.c updated integration/liboqs/config_ppc64le.h updated dev/ppc64le/src/intt_ppc.S updated mlkem/src/native/ppc64le/src/ntt_ppc.S updated mlkem/mlkem_native_asm.S ✓ Finalize and write files (0.0s) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 0:00:11 Done ✓ Signed-off-by: Danny Tsen --- BIBLIOGRAPHY.md | 1 + dev/ppc64le/src/intt_ppc.S | 24 ++++++++++----------- dev/ppc64le/src/ntt_ppc.S | 14 ++++++------ dev/ppc64le/src/poly_tomont.S | 6 +++--- dev/ppc64le/src/reduce.S | 6 +++--- integration/liboqs/config_ppc64le.h | 3 ++- mlkem/mlkem_native.c | 25 ++++++++++++++++++++++ mlkem/mlkem_native_asm.S | 25 ++++++++++++++++++++++ mlkem/src/native/ppc64le/src/intt_ppc.S | 4 ++-- 
mlkem/src/native/ppc64le/src/ntt_ppc.S | 4 ++-- mlkem/src/native/ppc64le/src/poly_tomont.S | 4 ++-- mlkem/src/native/ppc64le/src/reduce.S | 4 ++-- 12 files changed, 86 insertions(+), 34 deletions(-) diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 231c5e5d26..3f2751394c 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -72,6 +72,7 @@ source code and documentation. - [examples/multilevel_build_native/mlkem_native/mlkem_native_config.h](examples/multilevel_build_native/mlkem_native/mlkem_native_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) + - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/mlkem_native_config.h](mlkem/mlkem_native_config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 6782b3b88a..dabc962437 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -190,7 +190,7 @@ * R21: offset to r'3 = r'2 + step * */ -.macro Init_Coeffs_offset start next +.macro Init_Coeffs_offset start, next li a1_offset, \start /* first offset to j */ add b1_offset, len_2, a1_offset /* J + len*2 */ addi a2_offset, a1_offset, \next @@ -226,8 +226,8 @@ * following order, * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 */ -.macro Load_4Coeffs start next - Init_Coeffs_offset \start \next +.macro Load_4Coeffs start, next + Init_Coeffs_offset \start, \next Load_4Rjp Compute_4Coeffs .endm @@ -302,7 +302,7 @@ xxpermdi 32+vdata_a4, 11, 10, 0 .endm -.macro BREDUCE_4X _v0 _v1 _v2 _v3 +.macro BREDUCE_4X _v0, _v1, _v2, _v3 /* Restore constant vectors V_MKQ, V2pw25 and V_26 */ vxor 7, 7, 7 @@ -365,7 +365,7 @@ * ----------------------------------- * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) */ -.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 _vo0 _vo1 _vo2 _vo3 
+.macro MREDUCE_4X _vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3 /* Modular multification bond by 2^16 * q in abs value */ vmladduhm 15, vdata_mont1, \_vz0, rinp vmladduhm 20, vdata_mont2, \_vz1, rinp @@ -417,14 +417,14 @@ addi zeta_inp, zeta_inp, 64 .endm -.macro Write_B4C _vs0 _vs1 _vs2 _vs3 +.macro Write_B4C _vs0, _vs1, _vs2, _vs3 stxvd2x \_vs0, rinp, a1_offset stxvd2x \_vs1, rinp, a2_offset stxvd2x \_vs2, rinp, a3_offset stxvd2x \_vs3, rinp, a4_offset .endm -.macro Write_M4C _vs0 _vs1 _vs2 _vs3 +.macro Write_M4C _vs0, _vs1, _vs2, _vs3 stxvd2x \_vs0, rinp, b1_offset stxvd2x \_vs1, rinp, b2_offset stxvd2x \_vs2, rinp, b3_offset @@ -439,7 +439,7 @@ addi rinp, rinp, 64 .endm -.macro MWrite_8X _vs0 _vs1 _vs2 _vs3 _vs4 _vs5 _vs6 _vs7 +.macro MWrite_8X _vs0, _vs1, _vs2, _vs3, _vs4, _vs5, _vs6, _vs7 addi rinp, rinp, -128 stxvd2x \_vs0, 0, rinp stxvd2x \_vs1, 10, rinp @@ -543,7 +543,7 @@ /* * INTT layer 3 and 4, Len=8 and 16. */ -.macro INTT_REDUCE_4X start next +.macro INTT_REDUCE_4X start, next Load_4Coeffs \start, \next BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 @@ -556,7 +556,7 @@ /* * INTT layer 5, 6 and 7, Len=32, 64 and 128. 
*/ -.macro INTT_REDUCE_L567 start next +.macro INTT_REDUCE_L567 start, next Load_4Coeffs \start, \next BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 @@ -824,5 +824,5 @@ intt_ppc__Loopf: #undef b4_offset /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 788c1cfd3f..0ce79bb836 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -156,7 +156,7 @@ * R21: offset to r'3 = r'2 + step * */ -.macro Init_Coeffs_offset start next +.macro Init_Coeffs_offset start, next li a1_offset, \start /* first offset to j */ add b1_offset, len_2, a1_offset /* J + len*2 */ addi a2_offset, a1_offset, \next @@ -183,8 +183,8 @@ * following order, * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 */ -.macro Load_4Coeffs start next - Init_Coeffs_offset \start \next +.macro Load_4Coeffs start, next + Init_Coeffs_offset \start, \next Load_4Rjp .endm @@ -266,7 +266,7 @@ * ----------------------------------- * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) */ -.macro MREDUCE_4X _vz0 _vz1 _vz2 _vz3 +.macro MREDUCE_4X _vz0, _vz1, _vz2, _vz3 /* fqmul = zeta * coefficient Modular multification bond by 2^16 * q in abs value */ vmladduhm 15, vdata_b1, \_vz0, rinp @@ -426,7 +426,7 @@ /* * NTT other layers, 1, 2, 3, 4, 5. 
*/ -.macro NTT_MREDUCE_4X start next _vz0 _vz1 _vz2 _vz3 +.macro NTT_MREDUCE_4X start, next, _vz0, _vz1, _vz2, _vz3 Load_4Coeffs \start, \next MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 Load_4Rj @@ -649,5 +649,5 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) #undef b4_offset /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 7089c55756..0adf56e95a 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -41,7 +41,7 @@ * MREDUCE_4X(_v0, _v1, _v2, _v3) */ -.macro MREDUCE_4X _v0 _v1 _v2 _v3 +.macro MREDUCE_4X _v0, _v1, _v2, _v3 lxvd2x 32+13, 0, 3 addi 3, 3, 16 lxvd2x 32+18, 0, 3 @@ -185,5 +185,5 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) #undef V_NMKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 6d23d54037..e53eefeee6 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -34,7 +34,7 @@ .machine "any" .text -.macro BREDUCE_4X _v0 _v1 _v2 _v3 +.macro BREDUCE_4X _v0, _v1, _v2, _v3 lxvd2x 32+8, 0, 3 lxvd2x 32+12, 14, 3 lxvd2x 32+16, 15, 3 @@ -232,5 +232,5 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) #undef V_MKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h index 2fa1cdbcf6..4e8da63047 100644 --- a/integration/liboqs/config_ppc64le.h +++ b/integration/liboqs/config_ppc64le.h @@ -8,7 +8,8 @@ * * - [FIPS140_3_IG] * Implementation Guidance for FIPS 140-3 and the Cryptographic 
Module - * Validation Program National Institute of Standards and Technology + * Validation Program + * National Institute of Standards and Technology * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements */ diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index a00697e271..a3a3b33e71 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -519,6 +519,31 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_INTT_OFFSET +#undef ZETA_NTT_OFFSET +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index aba6e6bc4c..e056fdf7a5 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -540,6 +540,31 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc 
+#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_INTT_OFFSET +#undef ZETA_NTT_OFFSET +#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 903f4eeffc..6ddad4f3d6 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -3414,5 +3414,5 @@ Lintt_ppc__Loopf: blr .cfi_endproc -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 7344a1d419..6957a8bccb 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -1787,5 +1787,5 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) blr .cfi_endproc -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index c0e29d5e04..7b2acc08af 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -357,5 +357,5 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) blr .cfi_endproc -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index bf589c4e8f..634f9920b8 100644 --- 
a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -709,5 +709,5 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) blr .cfi_endproc -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + */ From 78f2037e2187b13b5cf2a6edaf8dd7b5a2e9298a Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 7 Apr 2026 10:50:56 -0400 Subject: [PATCH 08/27] Fixed more typos. Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 6 +++--- dev/ppc64le/src/ntt_ppc.S | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index dabc962437..b5654354b4 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -453,7 +453,7 @@ .endm /* - * Transpose the final coefficients of 4-4 layout to the orginal + * Transpose the final coefficients of 4-4 layout to the original * coefficient array order. */ .macro PermWriteL44 @@ -480,7 +480,7 @@ .endm /* - * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * Transpose the final coefficients of 2-2-2-2 layout to the original * coefficient array order. */ .macro PermWriteL24 @@ -586,7 +586,7 @@ * * -> leg1 = leg1 + t, leg2 = leg1 - t * - * The resulting coeffients then store back to each leg's offset. + * The resulting coefficients then store back to each leg's offset. * * Each vector has the same corresponding zeta except len=4 and len=2. * diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 0ce79bb836..8d50ba4f7d 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -343,7 +343,7 @@ .endm /* - * Transpose the final coefficients of 4-4 layout to the orginal + * Transpose the final coefficients of 4-4 layout to the original * coefficient array order. 
*/ .macro PermWriteL44 @@ -367,7 +367,7 @@ .endm /* - * Transpose the final coefficients of 2-2-2-2 layout to the orginal + * Transpose the final coefficients of 2-2-2-2 layout to the original * coefficient array order. */ .macro PermWriteL24 @@ -454,7 +454,7 @@ * * -> leg1 = leg1 + t, leg2 = leg1 - t * - * The resulting coeffients then store back to each leg's offset. + * The resulting coefficients then store back to each leg's offset. * * Each vector has the same corresponding zeta except len=4 and len=2. * From 8ae2ebf8c5806c643a3f7c54e0104e2c8b901462 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Wed, 8 Apr 2026 12:34:03 -0400 Subject: [PATCH 09/27] Fixed monolithic_build_multilevel_native functional test by manually adding ppc64le assembly files and consts.c to mlkem_native_asm.S and mlkem_native.c since autogen did not add these files for ppc64le. Signed-off-by: Danny Tsen --- mlkem/mlkem_native.c | 3 +++ mlkem/mlkem_native_asm.S | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index a3a3b33e71..69a359c022 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -88,6 +88,9 @@ #include "src/native/riscv64/src/rv64v_debug.c" #include "src/native/riscv64/src/rv64v_poly.c" #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/consts.c" +#endif #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index e056fdf7a5..56da810871 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -95,6 +95,12 @@ #endif /* MLK_SYS_X86_64 */ #if defined(MLK_SYS_RISCV64) #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/intt_ppc.S" +#include "src/native/ppc64le/src/ntt_ppc.S" +#include "src/native/ppc64le/src/poly_tomont.S" +#include "src/native/ppc64le/src/reduce.S" +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if
defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) From ca4fd5878d2ce86de942a99eb7611119b5daed30 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 9 Apr 2026 01:53:20 -0400 Subject: [PATCH 10/27] Fixed markdown heading in README files. Signed-off-by: Danny Tsen --- dev/ppc64le/README.md | 2 +- mlkem/src/native/ppc64le/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md index 57abddbd29..6103a76646 100644 --- a/dev/ppc64le/README.md +++ b/dev/ppc64le/README.md @@ -1,6 +1,6 @@ [//]: # (SPDX-License-Identifier: CC-BY-4.0) -/* ppc64le backend (little endian) */ +# ppc64le backend (little endian) This directory contains a native backend for little endian POWER 9 (ppc64le) and above systems. Or, Power systems supports ISA 2.07 and above. diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md index 522cbfb967..7f29b4fa02 100644 --- a/mlkem/src/native/ppc64le/README.md +++ b/mlkem/src/native/ppc64le/README.md @@ -1,6 +1,6 @@ [//]: # (SPDX-License-Identifier: CC-BY-4.0) -/* ppc64le backend (little endian) */ +# ppc64le backend (little endian) This directory contains a native backend for little endian POWER 9 (ppc64le) and above systems. Or, Power systems supports ISA 2.07 and above. 
From d41c5c49989d9a772d4d86e7f9a575ded3107316 Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Thu, 9 Apr 2026 14:14:55 +0200 Subject: [PATCH 11/27] Add autogen support for ppc64le Signed-off-by: Basil Hess Signed-off-by: Danny Tsen --- scripts/autogen | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/autogen b/scripts/autogen index 7a611b7911..1c48171aff 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -2186,6 +2186,10 @@ def riscv64(c): return "/riscv64/" in c +def ppc64le(c): + return "/ppc64le/" in c + + def armv81m(c): return "/armv81m/" in c @@ -2231,12 +2235,17 @@ def native_arith_riscv64(c): return native_arith(c) and riscv64(c) +def native_arith_ppc64le(c): + return native_arith(c) and ppc64le(c) + + def native_arith_core(c): return ( native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c) and not native_arith_riscv64(c) + and not native_arith_ppc64le(c) ) @@ -2345,6 +2354,11 @@ def gen_macro_undefs(extra_notes=None): filt=native_arith_riscv64, desc="native code (Arith, RISC-V 64)" ) yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + yield from gen_monolithic_undef_all_core( + filt=native_arith_ppc64le, desc="native code (Arith, PPC64LE)" + ) + yield "#endif" yield "#endif" yield "#endif" yield "" @@ -2427,6 +2441,10 @@ def gen_monolithic_source_file(): for c in filter(native_arith_riscv64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2515,6 +2533,10 @@ def gen_monolithic_asm_file(): for c in filter(native_arith_riscv64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if 
defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" From d555371e854121c4f5f921cf1330d5d928ce82f8 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Fri, 10 Apr 2026 02:19:23 -0400 Subject: [PATCH 12/27] Merged with Basil's autogen and run autogen. Signed-off-by: Basil Hess Signed-off-by: Danny Tsen --- mlkem/mlkem_native.c | 55 ++++++++++++++++++++++------------------ mlkem/mlkem_native_asm.S | 55 ++++++++++++++++++++++------------------ 2 files changed, 60 insertions(+), 50 deletions(-) diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 69a359c022..9132cdda30 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -522,31 +522,6 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H -/* mlkem/src/native/ppc64le/meta.h */ -#undef MLK_ARITH_BACKEND_NAME -#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT -#undef MLK_NATIVE_PPC64LE_META_H -#undef MLK_USE_NATIVE_INTT -#undef MLK_USE_NATIVE_NTT -#undef MLK_USE_NATIVE_POLY_REDUCE -#undef MLK_USE_NATIVE_POLY_TOMONT -/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ -#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H -#undef mlk_intt_ppc -#undef mlk_ntt_ppc -#undef mlk_poly_tomont_ppc -#undef mlk_reduce_ppc -/* mlkem/src/native/ppc64le/src/consts.h */ -#undef C1353_OFFSET -#undef C1441_OFFSET -#undef C20159_OFFSET -#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H -#undef NQ_OFFSET -#undef QINV_OFFSET -#undef Q_OFFSET -#undef ZETA_INTT_OFFSET -#undef ZETA_NTT_OFFSET -#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) @@ -684,5 +659,35 @@ #undef mlk_debug_check_bounds_int16m1 #undef mlk_debug_check_bounds_int16m2 #endif /* MLK_SYS_RISCV64 */ +#if defined(MLK_SYS_PPC64LE) +/* + * Undefine macros from native code (Arith, PPC64LE) + */ +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef 
MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_INTT_OFFSET +#undef ZETA_NTT_OFFSET +#undef mlk_ppc_qdata +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index 56da810871..18588c0668 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -546,31 +546,6 @@ #undef MLK_NTT_BOUND /* mlkem/src/native/meta.h */ #undef MLK_NATIVE_META_H -/* mlkem/src/native/ppc64le/meta.h */ -#undef MLK_ARITH_BACKEND_NAME -#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT -#undef MLK_NATIVE_PPC64LE_META_H -#undef MLK_USE_NATIVE_INTT -#undef MLK_USE_NATIVE_NTT -#undef MLK_USE_NATIVE_POLY_REDUCE -#undef MLK_USE_NATIVE_POLY_TOMONT -/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ -#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H -#undef mlk_intt_ppc -#undef mlk_ntt_ppc -#undef mlk_poly_tomont_ppc -#undef mlk_reduce_ppc -/* mlkem/src/native/ppc64le/src/consts.h */ -#undef C1353_OFFSET -#undef C1441_OFFSET -#undef C20159_OFFSET -#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H -#undef NQ_OFFSET -#undef QINV_OFFSET -#undef Q_OFFSET -#undef ZETA_INTT_OFFSET -#undef ZETA_NTT_OFFSET -#undef mlk_ppc_qdata #if defined(MLK_SYS_AARCH64) /* * Undefine macros from native code (Arith, AArch64) @@ -708,5 +683,35 @@ #undef mlk_debug_check_bounds_int16m1 #undef mlk_debug_check_bounds_int16m2 #endif /* MLK_SYS_RISCV64 */ +#if defined(MLK_SYS_PPC64LE) +/* + * Undefine macros from native code (Arith, PPC64LE) + */ +/* 
mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc +#undef mlk_ntt_ppc +#undef mlk_poly_tomont_ppc +#undef mlk_reduce_ppc +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef C1353_OFFSET +#undef C1441_OFFSET +#undef C20159_OFFSET +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef NQ_OFFSET +#undef QINV_OFFSET +#undef Q_OFFSET +#undef ZETA_INTT_OFFSET +#undef ZETA_NTT_OFFSET +#undef mlk_ppc_qdata +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ From ff708a152c69ede427a752b89abc22293a4ebcf6 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Fri, 10 Apr 2026 07:39:22 -0400 Subject: [PATCH 13/27] Have capability checked to support PPC_FEATURE2_ARCH_2_07 and above and fall back to default implementation if not. Signed-off-by: Danny Tsen --- dev/ppc64le/meta.h | 32 ++++++++++++++++++++++++++++++++ mlkem/src/native/ppc64le/meta.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index e804e97c7a..8df79b0e10 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -25,9 +25,29 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" +#include + +static int mlkem_ppc_check_cap() +{ + static int ppc_inited = 0; + static int have_cap = 0; + + if (ppc_inited) + { + return have_cap; + } + have_cap = (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) ? 
1 : 0; + ppc_inited = 1; + return have_cap; +} + MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_ntt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -35,6 +55,10 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_intt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -42,6 +66,10 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_reduce_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -49,6 +77,10 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_poly_tomont_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index f7057051a4..f61d9340c9 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -25,9 +25,29 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" +#include + +static int mlkem_ppc_check_cap() +{ + static int ppc_inited = 0; + static int have_cap = 0; + + if (ppc_inited) + { + return have_cap; + } + have_cap = (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) ? 
1 : 0; + ppc_inited = 1; + return have_cap; +} + MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_ntt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -35,6 +55,10 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_intt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -42,6 +66,10 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_reduce_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -49,6 +77,10 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { + if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + { + return MLK_NATIVE_FUNC_FALLBACK; + } mlk_poly_tomont_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } From 4ac67f00103dfe3b743c9dea7819dae835ff581c Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Mon, 13 Apr 2026 02:54:40 -0400 Subject: [PATCH 14/27] Fixed parameter passed to getauxval and remove machine directive. 
Signed-off-by: Danny Tsen --- dev/ppc64le/meta.h | 8 ++++---- dev/ppc64le/src/intt_ppc.S | 1 - dev/ppc64le/src/ntt_ppc.S | 1 - dev/ppc64le/src/poly_tomont.S | 1 - dev/ppc64le/src/reduce.S | 1 - mlkem/src/native/ppc64le/meta.h | 8 ++++---- 6 files changed, 8 insertions(+), 12 deletions(-) diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index 8df79b0e10..df2f4b4789 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -44,7 +44,7 @@ static int mlkem_ppc_check_cap() MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } @@ -55,7 +55,7 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } @@ -66,7 +66,7 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } @@ -77,7 +77,7 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index b5654354b4..c82186a188 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -17,7 +17,6 @@ #include "consts.h" -.machine "any" .text /* Barrett reduce constants */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index 8d50ba4f7d..bff5c27b40 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ 
b/dev/ppc64le/src/ntt_ppc.S @@ -57,7 +57,6 @@ #define b3_offset 19 #define b4_offset 21 -.machine "any" .text .macro SAVE_REGS diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 0adf56e95a..31d57ca22f 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -29,7 +29,6 @@ #define V_QINV 2 #define V_NMKQ 5 -.machine "any" .text /* diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index e53eefeee6..31b5ee86d0 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -31,7 +31,6 @@ #define V_26 2 #define V_MKQ 3 -.machine "any" .text .macro BREDUCE_4X _v0, _v1, _v2, _v3 diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index f61d9340c9..8e726a2a34 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -44,7 +44,7 @@ static int mlkem_ppc_check_cap() MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } @@ -55,7 +55,7 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } @@ -66,7 +66,7 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } @@ -77,7 +77,7 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap(PPC_FEATURE2_ARCH_2_07)) + if (!mlkem_ppc_check_cap()) { return MLK_NATIVE_FUNC_FALLBACK; } 
From bad6a0e64c3d6fd556e955a3ed1950951c8f2eb9 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 30 Apr 2026 02:05:19 -0400 Subject: [PATCH 15/27] Remove capability check PPC_FEATURE2_ARCH_2_07. Signed-off-by: Danny Tsen --- dev/ppc64le/meta.h | 32 -------------------------------- mlkem/src/native/ppc64le/meta.h | 32 -------------------------------- 2 files changed, 64 deletions(-) diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index df2f4b4789..e804e97c7a 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -25,29 +25,9 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" -#include - -static int mlkem_ppc_check_cap() -{ - static int ppc_inited = 0; - static int have_cap = 0; - - if (ppc_inited) - { - return have_cap; - } - have_cap = (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) ? 1 : 0; - ppc_inited = 1; - return have_cap; -} - MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_ntt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -55,10 +35,6 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_intt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -66,10 +42,6 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_reduce_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -77,10 +49,6 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } 
mlk_poly_tomont_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 8e726a2a34..f7057051a4 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -25,29 +25,9 @@ #include "../api.h" #include "src/arith_native_ppc64le.h" -#include - -static int mlkem_ppc_check_cap() -{ - static int ppc_inited = 0; - static int have_cap = 0; - - if (ppc_inited) - { - return have_cap; - } - have_cap = (getauxval(AT_HWCAP2) & PPC_FEATURE2_ARCH_2_07) ? 1 : 0; - ppc_inited = 1; - return have_cap; -} - MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_ntt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -55,10 +35,6 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_intt_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -66,10 +42,6 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_reduce_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } @@ -77,10 +49,6 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - if (!mlkem_ppc_check_cap()) - { - return MLK_NATIVE_FUNC_FALLBACK; - } mlk_poly_tomont_ppc(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } From 26d171793d6672a3d7f345fb984b7ba8791e06bd Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 5 May 2026 06:06:21 -0400 Subject: [PATCH 16/27] This patch fixed the following items, 1. 
Fixed typos and minor comments. 2. Removed IZETA_NTT_OFFSET127 in consts_intt.inc. 3. Fixed _asm suffix in backend name. 4. Fixed MLK_PPC_ prefix in constants. Manually fixed the backend names of all assembly files in mlkem/src/native/ppc64le/src/ to run the test since the simpasm cannot be run properly for ppc64le in my env. Signed-off-by: Danny Tsen --- dev/ppc64le/meta.h | 8 +-- dev/ppc64le/src/arith_native_ppc64le.h | 16 +++--- dev/ppc64le/src/consts.h | 16 +++--- dev/ppc64le/src/consts_intt.inc | 2 +- dev/ppc64le/src/intt_ppc.S | 50 +++++++++---------- dev/ppc64le/src/ntt_ppc.S | 34 ++++++------- dev/ppc64le/src/poly_tomont.S | 14 +++--- dev/ppc64le/src/reduce.S | 15 +++--- mlkem/mlkem_native.c | 24 ++++----- mlkem/mlkem_native_asm.S | 24 ++++----- mlkem/src/native/ppc64le/meta.h | 8 +-- .../native/ppc64le/src/arith_native_ppc64le.h | 16 +++--- mlkem/src/native/ppc64le/src/consts.h | 16 +++--- mlkem/src/native/ppc64le/src/consts_intt.inc | 2 +- mlkem/src/native/ppc64le/src/intt_ppc.S | 8 +-- mlkem/src/native/ppc64le/src/ntt_ppc.S | 4 +- mlkem/src/native/ppc64le/src/poly_tomont.S | 4 +- mlkem/src/native/ppc64le/src/reduce.S | 4 +- 18 files changed, 128 insertions(+), 137 deletions(-) diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index e804e97c7a..6692c06483 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -28,28 +28,28 @@ MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - mlk_ntt_ppc(data, mlk_ppc_qdata); + mlk_ntt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - mlk_intt_ppc(data, mlk_ppc_qdata); + mlk_intt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - mlk_reduce_ppc(data, mlk_ppc_qdata); + mlk_reduce_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; }
MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h index 282b3566cd..5cf1c3b0f4 100644 --- a/dev/ppc64le/src/arith_native_ppc64le.h +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -9,16 +9,16 @@ #include "../../../common.h" #include "consts.h" -#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) -void mlk_ntt_ppc(int16_t *, const int16_t *); +#define mlk_ntt_ppc_asm MLK_NAMESPACE(ntt_ppc_asm) +void mlk_ntt_ppc_asm(int16_t *, const int16_t *); -#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) -void mlk_intt_ppc(int16_t *, const int16_t *); +#define mlk_intt_ppc_asm MLK_NAMESPACE(intt_ppc_asm) +void mlk_intt_ppc_asm(int16_t *, const int16_t *); -#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) -void mlk_reduce_ppc(int16_t *r, const int16_t *); +#define mlk_reduce_ppc_asm MLK_NAMESPACE(reduce_ppc_asm) +void mlk_reduce_ppc_asm(int16_t *r, const int16_t *); -#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) -void mlk_poly_tomont_ppc(int16_t *, const int16_t *); +#define mlk_poly_tomont_ppc_asm MLK_NAMESPACE(poly_tomont_ppc_asm) +void mlk_poly_tomont_ppc_asm(int16_t *, const int16_t *); #endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index c861ddec6c..704a372b8a 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -9,14 +9,14 @@ /* Offsets into the constant table */ /* check-magic: off */ -#define NQ_OFFSET 0 -#define QINV_OFFSET 16 -#define Q_OFFSET 32 -#define C20159_OFFSET 48 -#define C1441_OFFSET 64 -#define C1353_OFFSET 80 -#define ZETA_NTT_OFFSET 96 -#define ZETA_INTT_OFFSET 1104 +#define MLK_PPC_NQ_OFFSET 0 +#define MLK_PPC_QINV_OFFSET 16 +#define MLK_PPC_Q_OFFSET 32 +#define 
MLK_PPC_C20159_OFFSET 48 +#define MLK_PPC_C1441_OFFSET 64 +#define MLK_PPC_C1353_OFFSET 80 +#define MLK_PPC_ZETA_NTT_OFFSET 96 +#define MLK_PPC_ZETA_INTT_OFFSET 1104 /* check-magic: on */ #ifndef __ASSEMBLER__ diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc index d0203dd178..7b0c6d9314 100644 --- a/dev/ppc64le/src/consts_intt.inc +++ b/dev/ppc64le/src/consts_intt.inc @@ -4,7 +4,7 @@ */ /* - * For intt Len=2, offset IZETA_NTT_OFFSET127 + * For intt Len=2, * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) * Transpose z[0], z[1], z[2], z[3] * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index c82186a188..1b4319e381 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -1,9 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* + * * Copyright IBM Corp. 2025, 2026 * * =================================================================================== @@ -177,7 +175,7 @@ * * r7: len * 2, each coefficient component is 2 bytes. * - * register used for offset to coefficients, r[j] and r[j+len] + * registers used for offsets to coefficients, r[j] and r[j+len] * R9: offset to r0 = j * R16: offset to r1 = r0 + next * R18: offset to r2 = r1 + next @@ -335,7 +333,7 @@ vadduwm 17, 17, V2pw25 vadduwm 18, 18, V2pw25 /* Right shift and pack lower halfword, - results bond to 2^16 in abs value */ + results bound by 2^16 in abs value */ vsraw 4, 4, V_26 vsraw 5, 5, V_26 vsraw 9, 9, V_26 @@ -352,8 +350,8 @@ vsubuhm 13, 7, 13 vpkuwum 17, 18, 17 vsubuhm 17, 7, 17 - /* Modulo multify-Low unsigned halfword; - results bond to 2^16 * q in abs value. */ + /* Modulo multiply-Low unsigned halfword; + results bound by 2^16 * q in abs value. 
*/ vmladduhm \_v0, 4, V_MKQ, 8 vmladduhm \_v1, 9, V_MKQ, 12 vmladduhm \_v2, 13, V_MKQ, 16 @@ -365,7 +363,7 @@ * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) */ .macro MREDUCE_4X _vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3 - /* Modular multification bond by 2^16 * q in abs value */ + /* Modular multiplication bound by 2^16 * q in abs value */ vmladduhm 15, vdata_mont1, \_vz0, rinp vmladduhm 20, vdata_mont2, \_vz1, rinp vmladduhm 27, vdata_mont3, \_vz2, rinp @@ -566,7 +564,7 @@ .endm /* - * mlk_intt_ppc(int16_t *r, int16_t *qdata) + * mlk_intt_ppc_asm(int16_t *r, int16_t *qdata) * Compute inverse NTT based on the following 7 layers - * len = 2, 4, 8, 16, 32, 64, 128 * @@ -599,9 +597,9 @@ * pre-arranged for the leg1 and leg2. After the computation, each vector needs * to transpose back to its original 4-4 or 2-2-2-2 layout. */ -.global MLK_ASM_NAMESPACE(intt_ppc) -.align 4 -MLK_ASM_FN_SYMBOL(intt_ppc) +.global MLK_ASM_NAMESPACE(intt_ppc_asm) +.balign 16 +MLK_ASM_FN_SYMBOL(intt_ppc_asm) SAVE_REGS @@ -609,7 +607,7 @@ MLK_ASM_FN_SYMBOL(intt_ppc) Setup for Montgomery reduce */ lxvx 0, 0, qinp - li 10, QINV_OFFSET + li 10, MLK_PPC_QINV_OFFSET lxvx 32+V_QINV, 10, qinp xxlxor 32+3, 32+3, 32+3 vspltish 4, 1 @@ -618,8 +616,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 4, 32+4, 32+4 /* 1 vector */ /* Setup for Barrett reduce */ - li 10, Q_OFFSET - li 11, C20159_OFFSET + li 10, MLK_PPC_Q_OFFSET + li 11, MLK_PPC_C20159_OFFSET lxvx 6, 10, qinp /* V_MKQ */ lxvx 32+V20159, 11, qinp /* V20159 */ @@ -643,23 +641,23 @@ MLK_ASM_FN_SYMBOL(intt_ppc) /* * Montgomery reduce loops with constant 1441 */ - addi zeta_inp, qinp, C1441_OFFSET + addi zeta_inp, qinp, MLK_PPC_C1441_OFFSET lvx V1441, 0, zeta_inp li 8, 4 mtctr 8 Set_mont_consts -intt_ppc__Loopf: +intt_ppc_asm_Loopf: Reload_4coeffs MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 Reload_4coeffs MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 - bdnz 
intt_ppc__Loopf + bdnz intt_ppc_asm_Loopf addi rinp, rinp, -512 -.align 4 +.align 16 /* * Layer 1. len = 2 * leg1 offset - 0, 32, 64, 96 @@ -668,7 +666,7 @@ intt_ppc__Loopf: * Update zetas vectors, each vector has 2 zetas * Load zeta vectors in 2-2-2-2 layout */ - addi zeta_inp, qinp, ZETA_INTT_OFFSET + addi zeta_inp, qinp, MLK_PPC_ZETA_INTT_OFFSET li len_2, 4 /* len * 2 */ mr dup_rinp, rinp @@ -681,7 +679,7 @@ intt_ppc__Loopf: INTT_REDUCE_L24 addi dup_rinp, dup_rinp, 128 -.align 4 +.balign 16 /* * Layer 2. len = 4 * leg1 offset - 0, 32, 64, 96 @@ -701,7 +699,7 @@ intt_ppc__Loopf: INTT_REDUCE_L44 addi dup_rinp, dup_rinp, 128 -.align 4 +.balign 16 /* * Layer 3. len = 8, start = 0, 128, 256, 384 */ @@ -712,7 +710,7 @@ intt_ppc__Loopf: INTT_REDUCE_4X 256, 32 INTT_REDUCE_4X 384, 32 -.align 4 +.balign 16 /* * Layer 4. len = 16, start = 0, 16, 256, 272 */ @@ -728,7 +726,7 @@ intt_ppc__Loopf: addi zeta_inp, zeta_inp, -64 INTT_REDUCE_4X 272, 64 -.align 4 +.balign 16 /* * Layer 5. len = 32, start = 0, 128, 256, 384 */ @@ -743,7 +741,7 @@ intt_ppc__Loopf: INTT_REDUCE_L567 384, 16 addi zeta_inp, zeta_inp, 16 -.align 4 +.balign 16 /* * Layer 6. len = 64, start = 0, 64, 256, 320 */ @@ -756,7 +754,7 @@ intt_ppc__Loopf: INTT_REDUCE_L567 320, 16 addi zeta_inp, zeta_inp, 16 -.align 4 +.balign 16 /* * Layer 7. len = 128, start = 0, 64, 128, 192 */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc.S index bff5c27b40..e9dae73ac8 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc.S @@ -1,9 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* + * * Copyright IBM Corp. 
2025, 2026 * * =================================================================================== @@ -267,7 +265,7 @@ */ .macro MREDUCE_4X _vz0, _vz1, _vz2, _vz3 /* fqmul = zeta * coefficient - Modular multification bond by 2^16 * q in abs value */ + Modular multiplication bound by 2^16 * q in abs value */ vmladduhm 15, vdata_b1, \_vz0, rinp vmladduhm 20, vdata_b2, \_vz1, rinp vmladduhm 25, vdata_b3, \_vz2, rinp @@ -434,7 +432,7 @@ .endm /* - * mlk_ntt_ppc(int16_t *r, int16_t *qdata) + * mlk_ntt_ppc_asm(int16_t *r, int16_t *qdata) * Compute forward NTT based on the following 7 layers - * len = 128, 64, 32, 16, 8, 4, 2. * @@ -468,25 +466,25 @@ * to transpose back to its original 4-4 or 2-2-2-2 layout. * */ -.global MLK_ASM_NAMESPACE(ntt_ppc) -.align 4 -MLK_ASM_FN_SYMBOL(ntt_ppc) +.global MLK_ASM_NAMESPACE(ntt_ppc_asm) +.balign 16 +MLK_ASM_FN_SYMBOL(ntt_ppc_asm) SAVE_REGS - /* load MLKEM_Q */ + /* load -MLKEM_Q */ lvx V_NMKQ,0,qinp /* Register 14 as pointer to zetas array */ - addi zeta_inp, qinp, ZETA_NTT_OFFSET + addi zeta_inp, qinp, MLK_PPC_ZETA_NTT_OFFSET vxor 3, 3, 3 vspltish 4, 1 - li 10, QINV_OFFSET + li 10, MLK_PPC_QINV_OFFSET lvx V_QINV, 10, qinp -.align 4 +.balign 16 /* * Layer 1. len = 128, start= 0, 64, 128, 192 */ @@ -499,7 +497,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA -.align 4 +.balign 16 /* * Layer 2. len = 64, start= 0, 64, 256, 320 */ @@ -514,7 +512,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA -.align 4 +.balign 16 /* * Layer 3. len = 32, start = 0, 128, 256, 384 */ @@ -535,7 +533,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) addi zeta_inp, zeta_inp, 16 NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA -.align 4 +.balign 16 /* * Layer 4. 
len = 16, start = 0, 16, 256, 272 */ @@ -548,7 +546,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 -.align 4 +.balign 16 /* * Layer 5. len = 8, start= 0, 128, 256, 384 */ @@ -583,7 +581,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) li 17, 96 li 18, 112 -.align 4 +.balign 16 NTT_REDUCE_L44 NTT_REDUCE_L44 NTT_REDUCE_L44 @@ -599,7 +597,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc) mr dup_rinp, rinp /* Let r5 points to coefficient array */ li len_2, 4 -.align 4 +.balign 16 NTT_REDUCE_L24 NTT_REDUCE_L24 NTT_REDUCE_L24 diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont.S index 31d57ca22f..9b35eb8dc0 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont.S @@ -1,9 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* + * * Copyright IBM Corp. 2025, 2026 * *=================================================================================== @@ -87,9 +85,9 @@ stxvd2x 32+7, 11, 3 .endm -.global MLK_ASM_NAMESPACE(poly_tomont_ppc) +.global MLK_ASM_NAMESPACE(poly_tomont_ppc_asm) .balign 16 -MLK_ASM_FN_SYMBOL(poly_tomont_ppc) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) stdu 1, -320(1) mflr 0 @@ -116,9 +114,9 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc) stxvx 32+29, 8, 1 stxvx 32+30, 9, 1 - li 6, NQ_OFFSET - li 7, QINV_OFFSET - li 8, C1353_OFFSET + li 6, MLK_PPC_NQ_OFFSET + li 7, MLK_PPC_QINV_OFFSET + li 8, MLK_PPC_C1353_OFFSET lxvx 32+V_NMKQ, 6, 4 lxvx 32+V_QINV, 7, 4 lxvx 32+V1353, 8, 4 diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce.S index 31b5ee86d0..a560191f9f 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce.S @@ -1,10 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - - -/* + * * Copyright IBM Corp. 
2025, 2026 * *=================================================================================== @@ -127,9 +124,9 @@ stxvd2x 32+0, 9, 3 .endm -.global MLK_ASM_NAMESPACE(reduce_ppc) +.global MLK_ASM_NAMESPACE(reduce_ppc_asm) .balign 16 -MLK_ASM_FN_SYMBOL(reduce_ppc) +MLK_ASM_FN_SYMBOL(reduce_ppc_asm) stdu 1, -224(1) mflr 0 std 14, 96(1) @@ -148,8 +145,8 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) vxor 7, 7, 7 - li 6, Q_OFFSET - li 7, C20159_OFFSET + li 6, MLK_PPC_Q_OFFSET + li 7, MLK_PPC_C20159_OFFSET lxvx 32+V_MKQ, 6, 4 lxvx 32+V20159, 7, 4 @@ -188,7 +185,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc) BREDUCE_4X 4, 9, 13, 17 Write_8X -.align 4 +.balign 16 /* * To unsigned canonical */ diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 9132cdda30..1aa1a1236e 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -673,20 +673,20 @@ #undef MLK_USE_NATIVE_POLY_TOMONT /* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ #undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H -#undef mlk_intt_ppc -#undef mlk_ntt_ppc -#undef mlk_poly_tomont_ppc -#undef mlk_reduce_ppc +#undef mlk_intt_ppc_asm +#undef mlk_ntt_ppc_asm +#undef mlk_poly_tomont_ppc_asm +#undef mlk_reduce_ppc_asm /* mlkem/src/native/ppc64le/src/consts.h */ -#undef C1353_OFFSET -#undef C1441_OFFSET -#undef C20159_OFFSET #undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H -#undef NQ_OFFSET -#undef QINV_OFFSET -#undef Q_OFFSET -#undef ZETA_INTT_OFFSET -#undef ZETA_NTT_OFFSET +#undef MLK_PPC_C1353_OFFSET +#undef MLK_PPC_C1441_OFFSET +#undef MLK_PPC_C20159_OFFSET +#undef MLK_PPC_NQ_OFFSET +#undef MLK_PPC_QINV_OFFSET +#undef MLK_PPC_Q_OFFSET +#undef MLK_PPC_ZETA_INTT_OFFSET +#undef MLK_PPC_ZETA_NTT_OFFSET #undef mlk_ppc_qdata #endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index 18588c0668..74bc7fb467 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -697,20 +697,20 @@ #undef MLK_USE_NATIVE_POLY_TOMONT /* 
mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ #undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H -#undef mlk_intt_ppc -#undef mlk_ntt_ppc -#undef mlk_poly_tomont_ppc -#undef mlk_reduce_ppc +#undef mlk_intt_ppc_asm +#undef mlk_ntt_ppc_asm +#undef mlk_poly_tomont_ppc_asm +#undef mlk_reduce_ppc_asm /* mlkem/src/native/ppc64le/src/consts.h */ -#undef C1353_OFFSET -#undef C1441_OFFSET -#undef C20159_OFFSET #undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H -#undef NQ_OFFSET -#undef QINV_OFFSET -#undef Q_OFFSET -#undef ZETA_INTT_OFFSET -#undef ZETA_NTT_OFFSET +#undef MLK_PPC_C1353_OFFSET +#undef MLK_PPC_C1441_OFFSET +#undef MLK_PPC_C20159_OFFSET +#undef MLK_PPC_NQ_OFFSET +#undef MLK_PPC_QINV_OFFSET +#undef MLK_PPC_Q_OFFSET +#undef MLK_PPC_ZETA_INTT_OFFSET +#undef MLK_PPC_ZETA_NTT_OFFSET #undef mlk_ppc_qdata #endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index f7057051a4..4b8fa13232 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -28,28 +28,28 @@ MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { - mlk_ntt_ppc(data, mlk_ppc_qdata); + mlk_ntt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { - mlk_intt_ppc(data, mlk_ppc_qdata); + mlk_intt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { - mlk_reduce_ppc(data, mlk_ppc_qdata); + mlk_reduce_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { - mlk_poly_tomont_ppc(data, mlk_ppc_qdata); + mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; } #endif /* !__ASSEMBLER__ */ diff --git 
a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h index 7ab3226c48..3bd47ebd76 100644 --- a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -9,16 +9,16 @@ #include "../../../common.h" #include "consts.h" -#define mlk_ntt_ppc MLK_NAMESPACE(ntt_ppc) -void mlk_ntt_ppc(int16_t *, const int16_t *); +#define mlk_ntt_ppc_asm MLK_NAMESPACE(ntt_ppc_asm) +void mlk_ntt_ppc_asm(int16_t *, const int16_t *); -#define mlk_intt_ppc MLK_NAMESPACE(intt_ppc) -void mlk_intt_ppc(int16_t *, const int16_t *); +#define mlk_intt_ppc_asm MLK_NAMESPACE(intt_ppc_asm) +void mlk_intt_ppc_asm(int16_t *, const int16_t *); -#define mlk_reduce_ppc MLK_NAMESPACE(reduce_ppc) -void mlk_reduce_ppc(int16_t *r, const int16_t *); +#define mlk_reduce_ppc_asm MLK_NAMESPACE(reduce_ppc_asm) +void mlk_reduce_ppc_asm(int16_t *r, const int16_t *); -#define mlk_poly_tomont_ppc MLK_NAMESPACE(poly_tomont_ppc) -void mlk_poly_tomont_ppc(int16_t *, const int16_t *); +#define mlk_poly_tomont_ppc_asm MLK_NAMESPACE(poly_tomont_ppc_asm) +void mlk_poly_tomont_ppc_asm(int16_t *, const int16_t *); #endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 90ad7b51cf..de04ea2191 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -9,14 +9,14 @@ /* Offsets into the constant table */ /* check-magic: off */ -#define NQ_OFFSET 0 -#define QINV_OFFSET 16 -#define Q_OFFSET 32 -#define C20159_OFFSET 48 -#define C1441_OFFSET 64 -#define C1353_OFFSET 80 -#define ZETA_NTT_OFFSET 96 -#define ZETA_INTT_OFFSET 1104 +#define MLK_PPC_NQ_OFFSET 0 +#define MLK_PPC_QINV_OFFSET 16 +#define MLK_PPC_Q_OFFSET 32 +#define MLK_PPC_C20159_OFFSET 48 +#define MLK_PPC_C1441_OFFSET 64 +#define MLK_PPC_C1353_OFFSET 80 +#define MLK_PPC_ZETA_NTT_OFFSET 96 +#define MLK_PPC_ZETA_INTT_OFFSET 1104 /* 
check-magic: on */ #ifndef __ASSEMBLER__ diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc index d0203dd178..7b0c6d9314 100644 --- a/mlkem/src/native/ppc64le/src/consts_intt.inc +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -4,7 +4,7 @@ */ /* - * For intt Len=2, offset IZETA_NTT_OFFSET127 + * For intt Len=2, * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) * Transpose z[0], z[1], z[2], z[3] * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index 6ddad4f3d6..d28deeb53f 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -22,8 +22,8 @@ .text .balign 16 -.global MLK_ASM_NAMESPACE(intt_ppc) -MLK_ASM_FN_SYMBOL(intt_ppc) +.global MLK_ASM_NAMESPACE(intt_ppc_asm) +MLK_ASM_FN_SYMBOL(intt_ppc_asm) .cfi_startproc stdu 1, -352(1) @@ -95,7 +95,7 @@ MLK_ASM_FN_SYMBOL(intt_ppc) xxlor 35, 3, 3 xxlor 36, 4, 4 -Lintt_ppc__Loopf: +Lintt_ppc_asm_Loopf: lxvd2x 57, 0, 3 lxvd2x 58, 10, 3 lxvd2x 62, 11, 3 @@ -156,7 +156,7 @@ Lintt_ppc__Loopf: stxvd2x 55, 17, 3 stxvd2x 60, 18, 3 addi 3, 3, 128 - bdnz Lintt_ppc__Loopf + bdnz Lintt_ppc_asm_Loopf addi 3, 3, -512 nop nop diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 6957a8bccb..09b3bc946b 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -22,8 +22,8 @@ .text .balign 16 -.global MLK_ASM_NAMESPACE(ntt_ppc) -MLK_ASM_FN_SYMBOL(ntt_ppc) +.global MLK_ASM_NAMESPACE(ntt_ppc_asm) +MLK_ASM_FN_SYMBOL(ntt_ppc_asm) .cfi_startproc stdu 1, -352(1) diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index 7b2acc08af..482b189a6e 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -30,8 +30,8 @@ .text .balign 16 -.global 
MLK_ASM_NAMESPACE(poly_tomont_ppc) -MLK_ASM_FN_SYMBOL(poly_tomont_ppc) +.global MLK_ASM_NAMESPACE(poly_tomont_ppc_asm) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) .cfi_startproc stdu 1, -320(1) diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 634f9920b8..16f6f7b826 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -23,8 +23,8 @@ .text .balign 16 -.global MLK_ASM_NAMESPACE(reduce_ppc) -MLK_ASM_FN_SYMBOL(reduce_ppc) +.global MLK_ASM_NAMESPACE(reduce_ppc_asm) +MLK_ASM_FN_SYMBOL(reduce_ppc_asm) .cfi_startproc stdu 1, -224(1) From e246cc7d53103a9ae190697232f640838983c19d Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Tue, 5 May 2026 08:06:53 -0400 Subject: [PATCH 17/27] Fixed missing balign in intt_ppc.S. Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc.S index 1b4319e381..ec0097dbed 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc.S @@ -657,7 +657,7 @@ intt_ppc_asm_Loopf: addi rinp, rinp, -512 -.align 16 +.balign 16 /* * Layer 1. len = 2 * leg1 offset - 0, 32, 64, 96 From 79219776edfbfa45df7c99a64e17d4e539fdfc59 Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Wed, 6 May 2026 18:45:56 -0400 Subject: [PATCH 18/27] Fixed CFI issue by re-running simpasm without cfify. Not sure if it is correct. ./scripts/tests func runs fine. 
Signed-off-by: Danny Tsen --- mlkem/src/native/ppc64le/src/intt_ppc.S | 151 +++++++++++---------- mlkem/src/native/ppc64le/src/ntt_ppc.S | 147 ++++++++++---------- mlkem/src/native/ppc64le/src/poly_tomont.S | 15 +- mlkem/src/native/ppc64le/src/reduce.S | 18 +-- 4 files changed, 167 insertions(+), 164 deletions(-) diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc.S index d28deeb53f..50537aaebf 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc.S @@ -1,9 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* + * * Copyright IBM Corp. 2025, 2026 * * =================================================================================== @@ -19,13 +17,11 @@ * dev/ppc64le/src/intt_ppc.S using scripts/simpasm. Do not modify it directly. */ - .text -.balign 16 +.balign 4 .global MLK_ASM_NAMESPACE(intt_ppc_asm) MLK_ASM_FN_SYMBOL(intt_ppc_asm) - .cfi_startproc stdu 1, -352(1) mflr 0 std 14, 56(1) @@ -95,7 +91,7 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) xxlor 35, 3, 3 xxlor 36, 4, 4 -Lintt_ppc_asm_Loopf: +intt_ppc_asm_Loopf: lxvd2x 57, 0, 3 lxvd2x 58, 10, 3 lxvd2x 62, 11, 3 @@ -156,13 +152,13 @@ Lintt_ppc_asm_Loopf: stxvd2x 55, 17, 3 stxvd2x 60, 18, 3 addi 3, 3, 128 - bdnz Lintt_ppc_asm_Loopf + bdnz intt_ppc_asm_Loopf addi 3, 3, -512 nop nop addi 14, 4, 1104 li 7, 4 - mr 5, 3 + mr 5, 3 lxvd2x 57, 0, 5 lxvd2x 58, 10, 5 vmrgew 8, 25, 26 @@ -680,24 +676,24 @@ Lintt_ppc_asm_Loopf: stxvd2x 49, 18, 5 addi 5, 5, 128 nop - mr 5, 3 + mr 5, 3 li 7, 8 lxvd2x 10, 0, 5 lxvd2x 11, 10, 5 - xxmrgld 40, 11, 10 - xxmrghd 53, 11, 10 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 lxvd2x 10, 11, 5 lxvd2x 11, 12, 5 - xxmrgld 44, 11, 10 - xxmrghd 54, 11, 10 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 lxvd2x 10, 15, 5 lxvd2x 11, 16, 5 - xxmrgld 48, 11, 10 - xxmrghd 55, 11, 10 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 lxvd2x 10, 17, 5 lxvd2x 11, 18, 5 - xxmrgld 
52, 11, 10 - xxmrghd 56, 11, 10 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 vsubuhm 25, 8, 21 vsubuhm 26, 12, 22 vsubuhm 30, 16, 23 @@ -794,14 +790,14 @@ Lintt_ppc_asm_Loopf: xxlor 51, 11, 11 xxlor 56, 12, 12 xxlor 61, 13, 13 - xxmrgld 42, 46, 45 - xxmrghd 43, 46, 45 - xxmrgld 44, 51, 50 - xxmrghd 45, 51, 50 - xxmrgld 46, 56, 55 - xxmrghd 47, 56, 55 - xxmrgld 48, 61, 60 - xxmrghd 49, 61, 60 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 stxvd2x 42, 0, 5 stxvd2x 43, 10, 5 stxvd2x 44, 11, 5 @@ -813,20 +809,20 @@ Lintt_ppc_asm_Loopf: addi 5, 5, 128 lxvd2x 10, 0, 5 lxvd2x 11, 10, 5 - xxmrgld 40, 11, 10 - xxmrghd 53, 11, 10 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 lxvd2x 10, 11, 5 lxvd2x 11, 12, 5 - xxmrgld 44, 11, 10 - xxmrghd 54, 11, 10 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 lxvd2x 10, 15, 5 lxvd2x 11, 16, 5 - xxmrgld 48, 11, 10 - xxmrghd 55, 11, 10 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 lxvd2x 10, 17, 5 lxvd2x 11, 18, 5 - xxmrgld 52, 11, 10 - xxmrghd 56, 11, 10 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 vsubuhm 25, 8, 21 vsubuhm 26, 12, 22 vsubuhm 30, 16, 23 @@ -923,14 +919,14 @@ Lintt_ppc_asm_Loopf: xxlor 51, 11, 11 xxlor 56, 12, 12 xxlor 61, 13, 13 - xxmrgld 42, 46, 45 - xxmrghd 43, 46, 45 - xxmrgld 44, 51, 50 - xxmrghd 45, 51, 50 - xxmrgld 46, 56, 55 - xxmrghd 47, 56, 55 - xxmrgld 48, 61, 60 - xxmrghd 49, 61, 60 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 stxvd2x 42, 0, 5 stxvd2x 43, 10, 5 stxvd2x 44, 11, 5 @@ -942,20 +938,20 @@ Lintt_ppc_asm_Loopf: addi 5, 5, 128 lxvd2x 10, 0, 5 lxvd2x 11, 10, 5 - xxmrgld 40, 11, 10 - xxmrghd 53, 11, 10 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 lxvd2x 10, 11, 5 lxvd2x 11, 12, 5 - xxmrgld 44, 11, 10 - xxmrghd 54, 11, 10 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 lxvd2x 10, 15, 5 lxvd2x 
11, 16, 5 - xxmrgld 48, 11, 10 - xxmrghd 55, 11, 10 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 lxvd2x 10, 17, 5 lxvd2x 11, 18, 5 - xxmrgld 52, 11, 10 - xxmrghd 56, 11, 10 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 vsubuhm 25, 8, 21 vsubuhm 26, 12, 22 vsubuhm 30, 16, 23 @@ -1052,14 +1048,14 @@ Lintt_ppc_asm_Loopf: xxlor 51, 11, 11 xxlor 56, 12, 12 xxlor 61, 13, 13 - xxmrgld 42, 46, 45 - xxmrghd 43, 46, 45 - xxmrgld 44, 51, 50 - xxmrghd 45, 51, 50 - xxmrgld 46, 56, 55 - xxmrghd 47, 56, 55 - xxmrgld 48, 61, 60 - xxmrghd 49, 61, 60 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 stxvd2x 42, 0, 5 stxvd2x 43, 10, 5 stxvd2x 44, 11, 5 @@ -1071,20 +1067,20 @@ Lintt_ppc_asm_Loopf: addi 5, 5, 128 lxvd2x 10, 0, 5 lxvd2x 11, 10, 5 - xxmrgld 40, 11, 10 - xxmrghd 53, 11, 10 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 lxvd2x 10, 11, 5 lxvd2x 11, 12, 5 - xxmrgld 44, 11, 10 - xxmrghd 54, 11, 10 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 lxvd2x 10, 15, 5 lxvd2x 11, 16, 5 - xxmrgld 48, 11, 10 - xxmrghd 55, 11, 10 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 lxvd2x 10, 17, 5 lxvd2x 11, 18, 5 - xxmrgld 52, 11, 10 - xxmrghd 56, 11, 10 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 vsubuhm 25, 8, 21 vsubuhm 26, 12, 22 vsubuhm 30, 16, 23 @@ -1181,14 +1177,14 @@ Lintt_ppc_asm_Loopf: xxlor 51, 11, 11 xxlor 56, 12, 12 xxlor 61, 13, 13 - xxmrgld 42, 46, 45 - xxmrghd 43, 46, 45 - xxmrgld 44, 51, 50 - xxmrghd 45, 51, 50 - xxmrgld 46, 56, 55 - xxmrghd 47, 56, 55 - xxmrgld 48, 61, 60 - xxmrghd 49, 61, 60 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 stxvd2x 42, 0, 5 stxvd2x 43, 10, 5 stxvd2x 44, 11, 5 @@ -3412,7 +3408,12 @@ Lintt_ppc_asm_Loopf: mtlr 0 addi 1, 1, 352 blr - .cfi_endproc + +MLK_ASM_FN_SIZE(intt_ppc_asm) #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && 
!MLK_CONFIG_MULTILEVEL_NO_SHARED \ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc.S index 09b3bc946b..50354a2861 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc.S @@ -1,9 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* + * * Copyright IBM Corp. 2025, 2026 * * =================================================================================== @@ -19,13 +17,11 @@ * dev/ppc64le/src/ntt_ppc.S using scripts/simpasm. Do not modify it directly. */ - .text -.balign 16 +.balign 4 .global MLK_ASM_NAMESPACE(ntt_ppc_asm) MLK_ASM_FN_SYMBOL(ntt_ppc_asm) - .cfi_startproc stdu 1, -352(1) mflr 0 std 14, 56(1) @@ -1183,7 +1179,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 58, 3, 19 stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 - mr 5, 3 + mr 5, 3 li 7, 8 li 10, 16 li 11, 32 @@ -1204,20 +1200,20 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) addi 14, 14, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 - xxmrgld 45, 2, 1 - xxmrghd 44, 2, 1 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 - xxmrgld 50, 4, 3 - xxmrghd 49, 4, 3 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 lxvd2x 1, 15, 5 lxvd2x 2, 16, 5 - xxmrgld 55, 2, 1 - xxmrghd 54, 2, 1 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 lxvd2x 3, 17, 5 lxvd2x 4, 18, 5 - xxmrgld 60, 4, 3 - xxmrghd 59, 4, 3 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 vmladduhm 15, 13, 7, 3 vmladduhm 20, 18, 8, 3 vmladduhm 25, 23, 9, 3 @@ -1246,14 +1242,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) vadduhm 25, 23, 22 vsubuhm 31, 27, 28 vadduhm 30, 28, 27 - xxmrgld 0, 47, 48 - xxmrghd 1, 47, 48 - xxmrgld 2, 52, 53 - xxmrghd 3, 52, 53 - xxmrgld 4, 57, 58 - xxmrghd 5, 57, 58 - xxmrgld 6, 62, 63 - xxmrghd 7, 62, 63 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 
stxvd2x 0, 0, 5 stxvd2x 1, 10, 5 stxvd2x 2, 11, 5 @@ -1273,20 +1269,20 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) addi 14, 14, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 - xxmrgld 45, 2, 1 - xxmrghd 44, 2, 1 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 - xxmrgld 50, 4, 3 - xxmrghd 49, 4, 3 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 lxvd2x 1, 15, 5 lxvd2x 2, 16, 5 - xxmrgld 55, 2, 1 - xxmrghd 54, 2, 1 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 lxvd2x 3, 17, 5 lxvd2x 4, 18, 5 - xxmrgld 60, 4, 3 - xxmrghd 59, 4, 3 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 vmladduhm 15, 13, 7, 3 vmladduhm 20, 18, 8, 3 vmladduhm 25, 23, 9, 3 @@ -1315,14 +1311,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) vadduhm 25, 23, 22 vsubuhm 31, 27, 28 vadduhm 30, 28, 27 - xxmrgld 0, 47, 48 - xxmrghd 1, 47, 48 - xxmrgld 2, 52, 53 - xxmrghd 3, 52, 53 - xxmrgld 4, 57, 58 - xxmrghd 5, 57, 58 - xxmrgld 6, 62, 63 - xxmrghd 7, 62, 63 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 stxvd2x 0, 0, 5 stxvd2x 1, 10, 5 stxvd2x 2, 11, 5 @@ -1342,20 +1338,20 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) addi 14, 14, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 - xxmrgld 45, 2, 1 - xxmrghd 44, 2, 1 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 - xxmrgld 50, 4, 3 - xxmrghd 49, 4, 3 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 lxvd2x 1, 15, 5 lxvd2x 2, 16, 5 - xxmrgld 55, 2, 1 - xxmrghd 54, 2, 1 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 lxvd2x 3, 17, 5 lxvd2x 4, 18, 5 - xxmrgld 60, 4, 3 - xxmrghd 59, 4, 3 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 vmladduhm 15, 13, 7, 3 vmladduhm 20, 18, 8, 3 vmladduhm 25, 23, 9, 3 @@ -1384,14 +1380,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) vadduhm 25, 23, 22 vsubuhm 31, 27, 28 vadduhm 30, 28, 27 - xxmrgld 0, 47, 48 - xxmrghd 1, 47, 48 - xxmrgld 2, 52, 53 - xxmrghd 3, 52, 53 - xxmrgld 4, 57, 58 - xxmrghd 5, 57, 58 - xxmrgld 6, 62, 63 - xxmrghd 7, 62, 63 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 
52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 stxvd2x 0, 0, 5 stxvd2x 1, 10, 5 stxvd2x 2, 11, 5 @@ -1411,20 +1407,20 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) addi 14, 14, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 - xxmrgld 45, 2, 1 - xxmrghd 44, 2, 1 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 lxvd2x 3, 11, 5 lxvd2x 4, 12, 5 - xxmrgld 50, 4, 3 - xxmrghd 49, 4, 3 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 lxvd2x 1, 15, 5 lxvd2x 2, 16, 5 - xxmrgld 55, 2, 1 - xxmrghd 54, 2, 1 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 lxvd2x 3, 17, 5 lxvd2x 4, 18, 5 - xxmrgld 60, 4, 3 - xxmrghd 59, 4, 3 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 vmladduhm 15, 13, 7, 3 vmladduhm 20, 18, 8, 3 vmladduhm 25, 23, 9, 3 @@ -1453,14 +1449,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) vadduhm 25, 23, 22 vsubuhm 31, 27, 28 vadduhm 30, 28, 27 - xxmrgld 0, 47, 48 - xxmrghd 1, 47, 48 - xxmrgld 2, 52, 53 - xxmrghd 3, 52, 53 - xxmrgld 4, 57, 58 - xxmrghd 5, 57, 58 - xxmrgld 6, 62, 63 - xxmrghd 7, 62, 63 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 stxvd2x 0, 0, 5 stxvd2x 1, 10, 5 stxvd2x 2, 11, 5 @@ -1470,7 +1466,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 6, 17, 5 stxvd2x 7, 18, 5 addi 5, 5, 128 - mr 5, 3 + mr 5, 3 li 7, 4 nop nop @@ -1785,7 +1781,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) mtlr 0 addi 1, 1, 352 blr - .cfi_endproc + +MLK_ASM_FN_SIZE(ntt_ppc_asm) #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont.S index 482b189a6e..abe354a69b 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont.S @@ -1,9 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* + 
* * Copyright IBM Corp. 2025, 2026 * *=================================================================================== @@ -27,13 +25,11 @@ * dev/ppc64le/src/poly_tomont.S using scripts/simpasm. Do not modify it directly. */ - .text -.balign 16 +.balign 4 .global MLK_ASM_NAMESPACE(poly_tomont_ppc_asm) MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) - .cfi_startproc stdu 1, -320(1) mflr 0 li 6, 128 @@ -355,7 +351,12 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) mtlr 0 addi 1, 1, 320 blr - .cfi_endproc + +MLK_ASM_FN_SIZE(poly_tomont_ppc_asm) #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce.S index 16f6f7b826..8d16f118ba 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce.S @@ -1,10 +1,7 @@ /* * Copyright (c) The mlkem-native project authors * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - - -/* + * * Copyright IBM Corp. 2025, 2026 * *=================================================================================== @@ -20,13 +17,11 @@ * dev/ppc64le/src/reduce.S using scripts/simpasm. Do not modify it directly. 
*/ - .text -.balign 16 +.balign 4 .global MLK_ASM_NAMESPACE(reduce_ppc_asm) MLK_ASM_FN_SYMBOL(reduce_ppc_asm) - .cfi_startproc stdu 1, -224(1) mflr 0 std 14, 96(1) @@ -490,7 +485,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) addi 3, 3, -512 vxor 9, 9, 9 vspltish 10, 15 - vmr 11, 3 + vmr 11, 3 lxvd2x 44, 0, 3 lxvd2x 45, 14, 3 lxvd2x 46, 15, 3 @@ -707,7 +702,12 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) mtlr 0 addi 1, 1, 224 blr - .cfi_endproc + +MLK_ASM_FN_SIZE(reduce_ppc_asm) #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif From 1289fe4d883ac04992cb073e0c37b738c52263bf Mon Sep 17 00:00:00 2001 From: Danny Tsen Date: Thu, 7 May 2026 10:56:41 -0400 Subject: [PATCH 19/27] This patch fixed the following, 1. Fixed wrong path in ML-KEM-768_META.yml. 2. Added __POWER8_VECTOR__ guard in asm files and meta.h. 3. Fixed capitalization in asm macros and misc. 4. Renamed asm files to _ppc_asm.S. Signed-off-by: Danny Tsen --- dev/ppc64le/README.md | 2 +- dev/ppc64le/meta.h | 16 ++++ .../src/{intt_ppc.S => intt_ppc_asm.S} | 94 +++++++++---------- dev/ppc64le/src/{ntt_ppc.S => ntt_ppc_asm.S} | 76 +++++++-------- .../{poly_tomont.S => poly_tomont_ppc_asm.S} | 24 ++--- .../src/{reduce.S => reduce_ppc_asm.S} | 22 ++--- integration/liboqs/ML-KEM-768_META.yml | 2 +- mlkem/src/native/ppc64le/README.md | 2 +- mlkem/src/native/ppc64le/meta.h | 16 ++++ .../src/{intt_ppc.S => intt_ppc_asm.S} | 4 +- .../ppc64le/src/{ntt_ppc.S => ntt_ppc_asm.S} | 4 +- .../{poly_tomont.S => poly_tomont_ppc_asm.S} | 4 +- .../src/{reduce.S => reduce_ppc_asm.S} | 4 +- test/mk/components.mk | 2 +- 14 files changed, 152 insertions(+), 120 deletions(-) rename dev/ppc64le/src/{intt_ppc.S => intt_ppc_asm.S} (91%) rename dev/ppc64le/src/{ntt_ppc.S => ntt_ppc_asm.S} (90%) rename dev/ppc64le/src/{poly_tomont.S => poly_tomont_ppc_asm.S} (89%) rename dev/ppc64le/src/{reduce.S => reduce_ppc_asm.S} (91%) rename 
mlkem/src/native/ppc64le/src/{intt_ppc.S => intt_ppc_asm.S} (99%) rename mlkem/src/native/ppc64le/src/{ntt_ppc.S => ntt_ppc_asm.S} (99%) rename mlkem/src/native/ppc64le/src/{poly_tomont.S => poly_tomont_ppc_asm.S} (98%) rename mlkem/src/native/ppc64le/src/{reduce.S => reduce_ppc_asm.S} (99%) diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md index 6103a76646..def9e7e7ef 100644 --- a/dev/ppc64le/README.md +++ b/dev/ppc64le/README.md @@ -3,5 +3,5 @@ # ppc64le backend (little endian) This directory contains a native backend for little endian POWER 9 (ppc64le) and above systems. -Or, Power systems supports ISA 2.07 and above. +Or, Power systems supporting ISA 2.07 and above. diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index 6692c06483..616c14f53c 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -28,29 +28,45 @@ MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_ntt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_intt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_reduce_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } #endif /* !__ASSEMBLER__ */ diff --git a/dev/ppc64le/src/intt_ppc.S b/dev/ppc64le/src/intt_ppc_asm.S similarity index 91% rename from 
dev/ppc64le/src/intt_ppc.S rename to dev/ppc64le/src/intt_ppc_asm.S index ec0097dbed..271d3e27fa 100644 --- a/dev/ppc64le/src/intt_ppc.S +++ b/dev/ppc64le/src/intt_ppc_asm.S @@ -10,7 +10,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* simpasm: header-end */ #include "consts.h" @@ -299,7 +299,7 @@ xxpermdi 32+vdata_a4, 11, 10, 0 .endm -.macro BREDUCE_4X _v0, _v1, _v2, _v3 +.macro barrett_reduce_4x _v0, _v1, _v2, _v3 /* Restore constant vectors V_MKQ, V2pw25 and V_26 */ vxor 7, 7, 7 @@ -360,9 +360,9 @@ /* * ----------------------------------- - * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) + * mont_reduce_4x(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) */ -.macro MREDUCE_4X _vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3 +.macro mont_reduce_4x _vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3 /* Modular multiplication bound by 2^16 * q in abs value */ vmladduhm 15, vdata_mont1, \_vz0, rinp vmladduhm 20, vdata_mont2, \_vz1, rinp @@ -506,60 +506,60 @@ /* * INTT layer 1, Len=2. */ -.macro INTT_REDUCE_L24 +.macro intt_layer1 Load_L24Coeffs Compute_4Coeffs - BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 xxlor 10, 32+vresult_brt1, 32+vresult_brt1 xxlor 11, 32+vresult_brt2, 32+vresult_brt2 xxlor 12, 32+vresult_brt3, 32+vresult_brt3 xxlor 13, 32+vresult_brt4, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 PermWriteL24 .endm /* * INTT layer 2, Len=4. 
*/ -.macro INTT_REDUCE_L44 +.macro intt_layer2 Load_L44Coeffs Compute_4Coeffs - BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 xxlor 10, 32+vresult_brt1, 32+vresult_brt1 xxlor 11, 32+vresult_brt2, 32+vresult_brt2 xxlor 12, 32+vresult_brt3, 32+vresult_brt3 xxlor 13, 32+vresult_brt4, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 PermWriteL44 .endm /* * INTT layer 3 and 4, Len=8 and 16. */ -.macro INTT_REDUCE_4X start, next +.macro intt_layer34 start, next Load_4Coeffs \start, \next - BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 Set_mont_consts Load_next_4zetas - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 .endm /* * INTT layer 5, 6 and 7, Len=32, 64 and 128. 
*/ -.macro INTT_REDUCE_L567 start, next +.macro intt_layer567 start, next Load_4Coeffs \start, \next - BREDUCE_4X vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 Set_mont_consts lvx V_ZETA, 0, 14 - MREDUCE_4X V_ZETA, V_ZETA, V_ZETA, V_ZETA, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + mont_reduce_4x V_ZETA, V_ZETA, V_ZETA, V_ZETA, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 .endm @@ -649,9 +649,9 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) Set_mont_consts intt_ppc_asm_Loopf: Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 6, 7, 8, 9 + mont_reduce_4x V1441, V1441, V1441, V1441, 6, 7, 8, 9 Reload_4coeffs - MREDUCE_4X V1441, V1441, V1441, V1441, 13, 18, 23, 28 + mont_reduce_4x V1441, V1441, V1441, V1441, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 bdnz intt_ppc_asm_Loopf @@ -670,13 +670,13 @@ intt_ppc_asm_Loopf: li len_2, 4 /* len * 2 */ mr dup_rinp, rinp - INTT_REDUCE_L24 + intt_layer1 addi dup_rinp, dup_rinp, 128 - INTT_REDUCE_L24 + intt_layer1 addi dup_rinp, dup_rinp, 128 - INTT_REDUCE_L24 + intt_layer1 addi dup_rinp, dup_rinp, 128 - INTT_REDUCE_L24 + intt_layer1 addi dup_rinp, dup_rinp, 128 .balign 16 @@ -690,13 +690,13 @@ intt_ppc_asm_Loopf: mr dup_rinp, rinp li len_2, 8 - INTT_REDUCE_L44 + intt_layer2 addi dup_rinp, dup_rinp, 128 - INTT_REDUCE_L44 + intt_layer2 addi dup_rinp, dup_rinp, 128 - INTT_REDUCE_L44 + intt_layer2 addi dup_rinp, dup_rinp, 128 - INTT_REDUCE_L44 + intt_layer2 addi dup_rinp, dup_rinp, 128 .balign 16 @@ -705,10 +705,10 @@ intt_ppc_asm_Loopf: */ li len_2, 16 - INTT_REDUCE_4X 0, 32 - INTT_REDUCE_4X 128, 32 - INTT_REDUCE_4X 256, 32 - INTT_REDUCE_4X 384, 32 + intt_layer34 0, 32 + intt_layer34 128, 32 + intt_layer34 256, 32 + intt_layer34 384, 32 
.balign 16 /* @@ -716,15 +716,15 @@ intt_ppc_asm_Loopf: */ li len_2, 32 - INTT_REDUCE_4X 0, 64 + intt_layer34 0, 64 addi zeta_inp, zeta_inp, -64 - INTT_REDUCE_4X 16, 64 + intt_layer34 16, 64 - INTT_REDUCE_4X 256, 64 + intt_layer34 256, 64 addi zeta_inp, zeta_inp, -64 - INTT_REDUCE_4X 272, 64 + intt_layer34 272, 64 .balign 16 /* @@ -732,13 +732,13 @@ intt_ppc_asm_Loopf: */ li len_2, 64 - INTT_REDUCE_L567 0, 16 + intt_layer567 0, 16 addi zeta_inp, zeta_inp, 16 - INTT_REDUCE_L567 128, 16 + intt_layer567 128, 16 addi zeta_inp, zeta_inp, 16 - INTT_REDUCE_L567 256, 16 + intt_layer567 256, 16 addi zeta_inp, zeta_inp, 16 - INTT_REDUCE_L567 384, 16 + intt_layer567 384, 16 addi zeta_inp, zeta_inp, 16 .balign 16 @@ -747,11 +747,11 @@ intt_ppc_asm_Loopf: */ li len_2, 128 - INTT_REDUCE_L567 0, 16 - INTT_REDUCE_L567 64, 16 + intt_layer567 0, 16 + intt_layer567 64, 16 addi zeta_inp, zeta_inp, 16 - INTT_REDUCE_L567 256, 16 - INTT_REDUCE_L567 320, 16 + intt_layer567 256, 16 + intt_layer567 320, 16 addi zeta_inp, zeta_inp, 16 .balign 16 @@ -760,10 +760,10 @@ intt_ppc_asm_Loopf: */ li len_2, 256 /* len*2 */ - INTT_REDUCE_L567 0, 16 - INTT_REDUCE_L567 64, 16 - INTT_REDUCE_L567 128, 16 - INTT_REDUCE_L567 192, 16 + intt_layer567 0, 16 + intt_layer567 64, 16 + intt_layer567 128, 16 + intt_layer567 192, 16 RESTORE_REGS blr @@ -821,5 +821,5 @@ intt_ppc_asm_Loopf: #undef b4_offset /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ diff --git a/dev/ppc64le/src/ntt_ppc.S b/dev/ppc64le/src/ntt_ppc_asm.S similarity index 90% rename from dev/ppc64le/src/ntt_ppc.S rename to dev/ppc64le/src/ntt_ppc_asm.S index e9dae73ac8..56f48f6080 100644 --- a/dev/ppc64le/src/ntt_ppc.S +++ b/dev/ppc64le/src/ntt_ppc_asm.S @@ -10,7 +10,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - 
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* simpasm: header-end */ #include "consts.h" @@ -261,9 +261,9 @@ * t = (a - (int32_t)t*_MLKEM_Q) >> 16 * * ----------------------------------- - * MREDUCE_4X(_vz0, _vz1, _vz2, _vz3) + * mont_reduce_4x(_vz0, _vz1, _vz2, _vz3) */ -.macro MREDUCE_4X _vz0, _vz1, _vz2, _vz3 +.macro mont_reduce_4x _vz0, _vz1, _vz2, _vz3 /* fqmul = zeta * coefficient Modular multiplication bound by 2^16 * q in abs value */ vmladduhm 15, vdata_b1, \_vz0, rinp @@ -401,10 +401,10 @@ /* * NTT layer 7, Len=2. */ -.macro NTT_REDUCE_L24 +.macro ntt_layer7 Load_next_4zetas Load_L24Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL24 addi dup_rinp, dup_rinp, 128 .endm @@ -412,10 +412,10 @@ /* * NTT layer 6, Len=4. */ -.macro NTT_REDUCE_L44 +.macro ntt_layer6 Load_next_4zetas Load_L44Coeffs - MREDUCE_4X V_Z0, V_Z1, V_Z2, V_Z3 + mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3 PermWriteL44 addi dup_rinp, dup_rinp, 128 .endm @@ -423,9 +423,9 @@ /* * NTT other layers, 1, 2, 3, 4, 5. 
*/ -.macro NTT_MREDUCE_4X start, next, _vz0, _vz1, _vz2, _vz3 +.macro ntt_layer12345 start, next, _vz0, _vz1, _vz2, _vz3 Load_4Coeffs \start, \next - MREDUCE_4x \_vz0, \_vz1, \_vz2, \_vz3 + mont_reduce_4x \_vz0, \_vz1, \_vz2, \_vz3 Load_4Rj Compute_4Coeffs Write_One @@ -492,10 +492,10 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - NTT_MREDUCE_4X 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .balign 16 /* @@ -504,13 +504,13 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li len_2, 128 lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - NTT_MREDUCE_4X 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - NTT_MREDUCE_4X 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .balign 16 /* @@ -519,19 +519,19 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li len_2, 64 lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 256, 16, 
V_ZETA, V_ZETA, V_ZETA, V_ZETA lvx V_ZETA, 0, zeta_inp addi zeta_inp, zeta_inp, 16 - NTT_MREDUCE_4X 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA .balign 16 /* @@ -539,12 +539,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 32 Load_next_4zetas - NTT_MREDUCE_4X 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 - NTT_MREDUCE_4X 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 Load_next_4zetas - NTT_MREDUCE_4X 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 - NTT_MREDUCE_4X 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 .balign 16 /* @@ -552,16 +552,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 16 Load_next_4zetas - NTT_MREDUCE_4X 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 Load_next_4zetas - NTT_MREDUCE_4X 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 Load_next_4zetas - NTT_MREDUCE_4X 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 Load_next_4zetas - NTT_MREDUCE_4X 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 /* * Layer 6. len = 4, @@ -582,10 +582,10 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 18, 112 .balign 16 - NTT_REDUCE_L44 - NTT_REDUCE_L44 - NTT_REDUCE_L44 - NTT_REDUCE_L44 + ntt_layer6 + ntt_layer6 + ntt_layer6 + ntt_layer6 /* * Layer 7. 
len = 2 @@ -598,10 +598,10 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li len_2, 4 .balign 16 - NTT_REDUCE_L24 - NTT_REDUCE_L24 - NTT_REDUCE_L24 - NTT_REDUCE_L24 + ntt_layer7 + ntt_layer7 + ntt_layer7 + ntt_layer7 RESTORE_REGS blr @@ -646,5 +646,5 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) #undef b4_offset /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ diff --git a/dev/ppc64le/src/poly_tomont.S b/dev/ppc64le/src/poly_tomont_ppc_asm.S similarity index 89% rename from dev/ppc64le/src/poly_tomont.S rename to dev/ppc64le/src/poly_tomont_ppc_asm.S index 9b35eb8dc0..12779545a6 100644 --- a/dev/ppc64le/src/poly_tomont.S +++ b/dev/ppc64le/src/poly_tomont_ppc_asm.S @@ -18,7 +18,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* simpasm: header-end */ #include "consts.h" @@ -35,10 +35,10 @@ * t = (a - (int32_t)t*_MLKEM_Q) >> 16 * *----------------------------------- - * MREDUCE_4X(_v0, _v1, _v2, _v3) + * mont_reduce_4x(_v0, _v1, _v2, _v3) */ -.macro MREDUCE_4X _v0, _v1, _v2, _v3 +.macro mont_reduce_4x _v0, _v1, _v2, _v3 lxvd2x 32+13, 0, 3 addi 3, 3, 16 lxvd2x 32+18, 0, 3 @@ -133,20 +133,20 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) li 10, -32 li 11, -16 - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 Write_8X - MREDUCE_4X 27, 28, 29, 30 - MREDUCE_4X 13, 18, 23, 7 + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 Write_8X li 6, 128 @@ -182,5 
+182,5 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) #undef V_NMKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ diff --git a/dev/ppc64le/src/reduce.S b/dev/ppc64le/src/reduce_ppc_asm.S similarity index 91% rename from dev/ppc64le/src/reduce.S rename to dev/ppc64le/src/reduce_ppc_asm.S index a560191f9f..51490d1f9c 100644 --- a/dev/ppc64le/src/reduce.S +++ b/dev/ppc64le/src/reduce_ppc_asm.S @@ -10,7 +10,7 @@ */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* simpasm: header-end */ #include "consts.h" @@ -30,7 +30,7 @@ .text -.macro BREDUCE_4X _v0, _v1, _v2, _v3 +.macro barrett_reduce_4x _v0, _v1, _v2, _v3 lxvd2x 32+8, 0, 3 lxvd2x 32+12, 14, 3 lxvd2x 32+16, 15, 3 @@ -169,20 +169,20 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) li 15, 32 li 16, 48 - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 Write_8X - BREDUCE_4X 21, 22, 23, 24 - BREDUCE_4X 4, 9, 13, 17 + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 Write_8X .balign 16 @@ -228,5 +228,5 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) #undef V_MKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 
1b01c4d426..28dceb229d 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -92,7 +92,7 @@ implementations: - name: ppc64le version: FIPS203 folder_name: . - compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="....//integration/liboqs/config_ppc64le.h" + compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md index 7f29b4fa02..733e32e113 100644 --- a/mlkem/src/native/ppc64le/README.md +++ b/mlkem/src/native/ppc64le/README.md @@ -3,4 +3,4 @@ # ppc64le backend (little endian) This directory contains a native backend for little endian POWER 9 (ppc64le) and above systems. -Or, Power systems supports ISA 2.07 and above. +Or, Power systems supporting ISA 2.07 and above. 
diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 4b8fa13232..4a3018c2ac 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -28,29 +28,45 @@ MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_ntt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_intt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_reduce_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } MLK_MUST_CHECK_RETURN_VALUE static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) { +#if defined(__POWER8_VECTOR__) mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; +#else + return MLK_NATIVE_FUNC_FALLBACK; +#endif } #endif /* !__ASSEMBLER__ */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc.S b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S similarity index 99% rename from mlkem/src/native/ppc64le/src/intt_ppc.S rename to mlkem/src/native/ppc64le/src/intt_ppc_asm.S index 50537aaebf..ae08438d8e 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S @@ -10,7 +10,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* * WARNING: This file is auto-derived from the mlkem-native source file @@ -3411,7 +3411,7 @@ intt_ppc_asm_Loopf: MLK_ASM_FN_SIZE(intt_ppc_asm) -#endif /* 
MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ #if defined(__ELF__) diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc.S b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S similarity index 99% rename from mlkem/src/native/ppc64le/src/ntt_ppc.S rename to mlkem/src/native/ppc64le/src/ntt_ppc_asm.S index 50354a2861..d017871578 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S @@ -10,7 +10,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* * WARNING: This file is auto-derived from the mlkem-native source file @@ -1784,7 +1784,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) MLK_ASM_FN_SIZE(ntt_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ #if defined(__ELF__) diff --git a/mlkem/src/native/ppc64le/src/poly_tomont.S b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S similarity index 98% rename from mlkem/src/native/ppc64le/src/poly_tomont.S rename to mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S index abe354a69b..acb25c4858 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S @@ -18,7 +18,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* * WARNING: This file is auto-derived from the mlkem-native source file @@ -354,7 +354,7 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) MLK_ASM_FN_SIZE(poly_tomont_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ 
+#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ #if defined(__ELF__) diff --git a/mlkem/src/native/ppc64le/src/reduce.S b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S similarity index 99% rename from mlkem/src/native/ppc64le/src/reduce.S rename to mlkem/src/native/ppc64le/src/reduce_ppc_asm.S index 8d16f118ba..9715b76d64 100644 --- a/mlkem/src/native/ppc64le/src/reduce.S +++ b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S @@ -10,7 +10,7 @@ */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) /* * WARNING: This file is auto-derived from the mlkem-native source file @@ -705,7 +705,7 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) MLK_ASM_FN_SIZE(reduce_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ */ #if defined(__ELF__) diff --git a/test/mk/components.mk b/test/mk/components.mk index 5c64ab4a8c..e083005f8e 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -9,7 +9,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS]) - SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif From f3705ddc331ebbc55ccecb17ea7553c8a889c9d6 Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Wed, 6 May 2026 17:37:23 +0200 Subject: [PATCH 20/27] extend autogen/simpasm for ppc64le backend. 
Auto-generate zeta values Signed-off-by: Danny Tsen --- dev/ppc64le/src/consts.c | 1 - dev/ppc64le/src/consts_intt.inc | 94 +++++++----- dev/ppc64le/src/consts_ntt.inc | 94 +++++++----- mlkem/src/native/ppc64le/src/consts.c | 1 - mlkem/src/native/ppc64le/src/consts_intt.inc | 94 +++++++----- mlkem/src/native/ppc64le/src/consts_ntt.inc | 94 +++++++----- scripts/autogen | 146 ++++++++++++++++++- scripts/simpasm | 8 +- 8 files changed, 369 insertions(+), 163 deletions(-) diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 48fe773ec8..74c8aa441c 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -69,7 +69,6 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { 1353, /* zetas for NTT */ #include "consts_ntt.inc" - , /* zetas for invNTT */ #include "consts_intt.inc" }; diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc index 7b0c6d9314..0e1dd4367e 100644 --- a/dev/ppc64le/src/consts_intt.inc +++ b/dev/ppc64le/src/consts_intt.inc @@ -3,23 +3,31 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ - /* - * For intt Len=2, - * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) - * Transpose z[0], z[1], z[2], z[3] - * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] - */ - -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, - -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, - -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, - -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, - -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, - 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, - -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, - 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, - -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, - 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, 
-1103, 555, 555, - /* For intt Len=4 */ +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. + */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, + -308, -308, 991, 991, -108, -108, 996, 996, + -854, -854, 478, 478, -1510, -1510, -870, -870, + -1530, -1530, 794, 794, -1185, -1185, -1278, -1278, + 220, 220, -1659, -1659, -874, -874, -1187, -1187, + -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, + 1097, 1097, 610, 610, 817, 817, 603, 603, + 329, 329, -75, -75, 418, 418, -156, -156, + 644, 644, 349, 349, -1590, -1590, -872, -872, + 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, + -460, -460, 1653, 1653, -291, -291, 1574, 1574, + 587, 587, -235, -235, 422, 422, 177, 177, + 871, 871, 105, 105, -1251, -1251, 1550, 1550, + 430, 430, 843, 843, -1103, -1103, 555, 555, 677, 677, 677, 677, -1275, -1275, -1275, -1275, 448, 448, 448, 448, -1065, -1065, -1065, -1065, -1508, -1508, -1508, -1508, -725, -725, -725, -725, @@ -36,24 +44,34 @@ -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1223, 1223, 1223, 1223, 652, 652, 652, 652, - /* For intt Len=8 and others */ - -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, - -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, - 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, - 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, - -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, - -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 
383, - 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, - -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, - 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, - -1474, -1474, -1474, -1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, - 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, - 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, - -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, - 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, - 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, - -758, -758, -758, -758, -758, -758 + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, + -205, -205, -205, -205, -205, -205, -205, -205, + 411, 411, 411, 411, 411, 411, 411, 411, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + 608, 608, 608, 608, 608, 608, 608, 608, + 732, 732, 732, 732, 732, 732, 732, 732, + 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, + -681, -681, -681, -681, -681, -681, -681, -681, + -130, -130, -130, -130, -130, -130, -130, -130, + -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, + 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + -829, -829, -829, -829, -829, -829, -829, -829, + 383, 383, 383, 383, 383, 383, 383, 383, + 264, 264, 264, 264, 264, 264, 264, 264, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, + 573, 573, 573, 573, 573, 573, 573, 573, + 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, + -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + 962, 962, 962, 962, 962, 962, 962, 962, + 182, 182, 182, 182, 182, 182, 182, 182, + 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, + 622, 622, 622, 622, 622, 
622, 622, 622, + -171, -171, -171, -171, -171, -171, -171, -171, + 202, 202, 202, 202, 202, 202, 202, 202, + 287, 287, 287, 287, 287, 287, 287, 287, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + -359, -359, -359, -359, -359, -359, -359, -359, + -758, -758, -758, -758, -758, -758, -758, -758, diff --git a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc index 2a0136f1e5..2729155aab 100644 --- a/dev/ppc64le/src/consts_ntt.inc +++ b/dev/ppc64le/src/consts_ntt.inc @@ -3,28 +3,46 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ - /* For ntt Len=128, offset 96 */ - -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, - -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, - 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, - 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, - -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, - 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, - 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, - 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, - -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, - -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, - -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, - 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, 
- -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, - 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, - /* For Len=4 */ +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -758, -758, -758, -758, -758, -758, -758, -758, + -359, -359, -359, -359, -359, -359, -359, -359, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + -171, -171, -171, -171, -171, -171, -171, -171, + 622, 622, 622, 622, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, + 182, 182, 182, 182, 182, 182, 182, 182, + 962, 962, 962, 962, 962, 962, 962, 962, + -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, + 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, + 264, 264, 264, 264, 264, 264, 264, 264, + 383, 383, 383, 383, 383, 383, 383, 383, + -829, -829, -829, -829, -829, -829, -829, -829, + 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, + -130, -130, -130, -130, -130, -130, -130, -130, + -681, -681, -681, -681, -681, -681, -681, -681, + 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, + 732, 732, 732, 732, 732, 732, 732, 732, + 608, 608, 608, 608, 608, 608, 608, 608, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + 411, 411, 411, 411, 411, 411, 411, 411, + -205, -205, -205, -205, -205, -205, -205, -205, + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, 652, 652, 652, 652, 1223, 1223, 1223, 
1223, 1015, 1015, 1015, 1015, -552, -552, -552, -552, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, @@ -41,19 +59,19 @@ -725, -725, -725, -725, -1508, -1508, -1508, -1508, -1065, -1065, -1065, -1065, 448, 448, 448, 448, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - /* - * For ntt Len=2 - * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) - * Transpose z[0], z[1], z[2], z[3] - * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] - */ - 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, - 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, - -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, - -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, - 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, - 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, - 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, - -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, - -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, - 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460 + 555, 555, -1103, -1103, 843, 843, 430, 430, + 1550, 1550, -1251, -1251, 105, 105, 871, 871, + 177, 177, 422, 422, -235, -235, 587, 587, + 1574, 1574, -291, -291, 1653, 1653, -460, -460, + 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, + -872, -872, -1590, -1590, 349, 349, 644, 644, + -156, -156, 418, 418, -75, -75, 329, 329, + 603, 603, 817, 817, 610, 610, 1097, 1097, + -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, + -1187, -1187, -874, -874, -1659, -1659, 220, 220, + -1278, -1278, -1185, -1185, 794, 794, -1530, -1530, + -870, -870, -1510, -1510, 478, 478, -854, -854, + 996, 996, -108, -108, 991, 991, -308, -308, + 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, diff 
--git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 48fe773ec8..74c8aa441c 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -69,7 +69,6 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { 1353, /* zetas for NTT */ #include "consts_ntt.inc" - , /* zetas for invNTT */ #include "consts_intt.inc" }; diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc index 7b0c6d9314..0e1dd4367e 100644 --- a/mlkem/src/native/ppc64le/src/consts_intt.inc +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -3,23 +3,31 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ - /* - * For intt Len=2, - * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) - * Transpose z[0], z[1], z[2], z[3] - * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] - */ - -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, -308, -308, 991, 991, -108, - -108, 996, 996, -854, -854, 478, 478, -1510, -1510, -870, -870, -1530, - -1530, 794, 794, -1185, -1185, -1278, -1278, 220, 220, -1659, -1659, -874, - -874, -1187, -1187, -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, - -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, 1097, 1097, 610, 610, 817, - 817, 603, 603, 329, 329, -75, -75, 418, 418, -156, -156, 644, 644, 349, 349, - -1590, -1590, -872, -872, 1483, 1483, 1119, 1119, -777, -777, -602, -602, - 778, 778, -147, -147, -246, -246, 1159, 1159, -460, -460, 1653, 1653, -291, - -291, 1574, 1574, 587, 587, -235, -235, 422, 422, 177, 177, 871, 871, 105, - 105, -1251, -1251, 1550, 1550, 430, 430, 843, 843, -1103, -1103, 555, 555, - /* For intt Len=4 */ +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. 
+ */ + -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, + -308, -308, 991, 991, -108, -108, 996, 996, + -854, -854, 478, 478, -1510, -1510, -870, -870, + -1530, -1530, 794, 794, -1185, -1185, -1278, -1278, + 220, 220, -1659, -1659, -874, -874, -1187, -1187, + -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, + -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, + 1097, 1097, 610, 610, 817, 817, 603, 603, + 329, 329, -75, -75, 418, 418, -156, -156, + 644, 644, 349, 349, -1590, -1590, -872, -872, + 1483, 1483, 1119, 1119, -777, -777, -602, -602, + 778, 778, -147, -147, -246, -246, 1159, 1159, + -460, -460, 1653, 1653, -291, -291, 1574, 1574, + 587, 587, -235, -235, 422, 422, 177, 177, + 871, 871, 105, 105, -1251, -1251, 1550, 1550, + 430, 430, 843, 843, -1103, -1103, 555, 555, 677, 677, 677, 677, -1275, -1275, -1275, -1275, 448, 448, 448, 448, -1065, -1065, -1065, -1065, -1508, -1508, -1508, -1508, -725, -725, -725, -725, @@ -36,24 +44,34 @@ -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, -552, -552, -552, -552, 1015, 1015, 1015, 1015, 1223, 1223, 1223, 1223, 652, 652, 652, 652, - /* For intt Len=8 and others */ - -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, -205, -205, -205, - -205, -205, -205, -205, -205, 411, 411, 411, 411, 411, 411, 411, 411, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 608, 608, 608, 608, 608, - 608, 608, 608, 732, 732, 732, 732, 732, 732, 732, 732, 1017, 1017, 1017, - 1017, 1017, 1017, 1017, 1017, -681, -681, -681, -681, -681, -681, -681, - -681, -130, -130, -130, -130, -130, -130, -130, -130, -1602, -1602, -1602, - -1602, -1602, -1602, -1602, -1602, 1458, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, -829, -829, -829, -829, -829, -829, -829, -829, 383, 383, 383, 383, - 383, 383, 383, 383, 264, 264, 264, 264, 264, 264, 264, 264, -1325, -1325, - -1325, -1325, -1325, -1325, -1325, -1325, 573, 573, 573, 573, 573, 573, 573, - 573, 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, -1474, -1474, -1474, - -1474, -1474, -1474, 
-1474, -1474, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1202, 962, 962, 962, 962, 962, 962, 962, 962, 182, 182, 182, 182, - 182, 182, 182, 182, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, 622, - 622, 622, 622, 622, 622, 622, 622, -171, -171, -171, -171, -171, -171, -171, - -171, 202, 202, 202, 202, 202, 202, 202, 202, 287, 287, 287, 287, 287, 287, - 287, 287, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1493, 1493, 1493, - 1493, 1493, 1493, 1493, 1493, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, -1517, -359, -359, -359, -359, -359, -359, -359, -359, -758, -758, - -758, -758, -758, -758, -758, -758 + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, + -205, -205, -205, -205, -205, -205, -205, -205, + 411, 411, 411, 411, 411, 411, 411, 411, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + 608, 608, 608, 608, 608, 608, 608, 608, + 732, 732, 732, 732, 732, 732, 732, 732, + 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, + -681, -681, -681, -681, -681, -681, -681, -681, + -130, -130, -130, -130, -130, -130, -130, -130, + -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, + 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + -829, -829, -829, -829, -829, -829, -829, -829, + 383, 383, 383, 383, 383, 383, 383, 383, + 264, 264, 264, 264, 264, 264, 264, 264, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, + 573, 573, 573, 573, 573, 573, 573, 573, + 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, + -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + 962, 962, 962, 962, 962, 962, 962, 962, + 182, 182, 182, 182, 182, 182, 182, 182, + 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, + 622, 622, 622, 622, 622, 622, 622, 622, + -171, -171, -171, -171, -171, -171, -171, -171, + 202, 202, 202, 202, 202, 202, 202, 202, + 287, 287, 287, 287, 287, 287, 287, 287, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + -1517, 
-1517, -1517, -1517, -1517, -1517, -1517, -1517, + -359, -359, -359, -359, -359, -359, -359, -359, + -758, -758, -758, -758, -758, -758, -758, -758, diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc index 2a0136f1e5..2729155aab 100644 --- a/mlkem/src/native/ppc64le/src/consts_ntt.inc +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -3,28 +3,46 @@ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT */ - /* For ntt Len=128, offset 96 */ - -758, -758, -758, -758, -758, -758, -758, -758, -359, -359, -359, -359, - -359, -359, -359, -359, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -1517, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, - 1422, 1422, 1422, 1422, 1422, 287, 287, 287, 287, 287, 287, 287, 287, 202, - 202, 202, 202, 202, 202, 202, 202, -171, -171, -171, -171, -171, -171, -171, - -171, 622, 622, 622, 622, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 1577, - 1577, 1577, 1577, 182, 182, 182, 182, 182, 182, 182, 182, 962, 962, 962, - 962, 962, 962, 962, 962, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1202, -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, 1468, 1468, - 1468, 1468, 1468, 1468, 1468, 1468, 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, 264, 264, 264, 264, - 264, 264, 264, 264, 383, 383, 383, 383, 383, 383, 383, 383, -829, -829, - -829, -829, -829, -829, -829, -829, 1458, 1458, 1458, 1458, 1458, 1458, - 1458, 1458, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, -130, - -130, -130, -130, -130, -130, -130, -130, -681, -681, -681, -681, -681, - -681, -681, -681, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 732, 732, - 732, 732, 732, 732, 732, 732, 608, 608, 608, 608, 608, 608, 608, 608, -1542, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, 411, 411, 411, 411, 411, - 411, 411, 411, -205, -205, -205, -205, -205, -205, -205, -205, -1571, -1571, - -1571, -1571, -1571, -1571, -1571, -1571, - /* 
For Len=4 */ +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -758, -758, -758, -758, -758, -758, -758, -758, + -359, -359, -359, -359, -359, -359, -359, -359, + -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 287, 287, 287, 287, 287, 287, 287, 287, + 202, 202, 202, 202, 202, 202, 202, 202, + -171, -171, -171, -171, -171, -171, -171, -171, + 622, 622, 622, 622, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, + 182, 182, 182, 182, 182, 182, 182, 182, + 962, 962, 962, 962, 962, 962, 962, 962, + -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, + -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, + 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, + 573, 573, 573, 573, 573, 573, 573, 573, + -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, + 264, 264, 264, 264, 264, 264, 264, 264, + 383, 383, 383, 383, 383, 383, 383, 383, + -829, -829, -829, -829, -829, -829, -829, -829, + 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, + -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, + -130, -130, -130, -130, -130, -130, -130, -130, + -681, -681, -681, -681, -681, -681, -681, -681, + 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, + 732, 732, 732, 732, 732, 732, 732, 732, + 608, 608, 608, 608, 608, 608, 608, 608, + -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, + 411, 411, 411, 411, 411, 411, 411, 411, + -205, -205, -205, -205, -205, -205, -205, -205, + -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, 652, 652, 652, 652, 1223, 1223, 1223, 1223, 1015, 1015, 1015, 1015, -552, -552, -552, -552, 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, @@ -41,19 +59,19 @@ -725, -725, -725, -725, -1508, -1508, -1508, -1508, -1065, -1065, -1065, -1065, 
448, 448, 448, 448, -1275, -1275, -1275, -1275, 677, 677, 677, 677, - /* - * For ntt Len=2 - * reorder zeta array, (1, 2, 3, 4) -> (3, 1, 4, 2) - * Transpose z[0], z[1], z[2], z[3] - * -> z[3], z[3], z[1], z[1], z[4], z[4], z[2], z[2] - */ - 555, 555, -1103, -1103, 843, 843, 430, 430, 1550, 1550, -1251, -1251, 105, - 105, 871, 871, 177, 177, 422, 422, -235, -235, 587, 587, 1574, 1574, -291, - -291, 1653, 1653, -460, -460, 1159, 1159, -246, -246, -147, -147, 778, 778, - -602, -602, -777, -777, 1119, 1119, 1483, 1483, -872, -872, -1590, -1590, - 349, 349, 644, 644, -156, -156, 418, 418, -75, -75, 329, 329, 603, 603, 817, - 817, 610, 610, 1097, 1097, -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, - 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, -1187, -1187, -874, - -874, -1659, -1659, 220, 220, -1278, -1278, -1185, -1185, 794, 794, -1530, - -1530, -870, -870, -1510, -1510, 478, 478, -854, -854, 996, 996, -108, -108, - 991, 991, -308, -308, 1522, 1522, 958, 958, 1628, 1628, -1460, -1460 + 555, 555, -1103, -1103, 843, 843, 430, 430, + 1550, 1550, -1251, -1251, 105, 105, 871, 871, + 177, 177, 422, 422, -235, -235, 587, 587, + 1574, 1574, -291, -291, 1653, 1653, -460, -460, + 1159, 1159, -246, -246, -147, -147, 778, 778, + -602, -602, -777, -777, 1119, 1119, 1483, 1483, + -872, -872, -1590, -1590, 349, 349, 644, 644, + -156, -156, 418, 418, -75, -75, 329, 329, + 603, 603, 817, 817, 610, 610, 1097, 1097, + -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, + 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, + -1187, -1187, -874, -874, -1659, -1659, 220, 220, + -1278, -1278, -1185, -1185, 794, 794, -1530, -1530, + -870, -870, -1510, -1510, 478, 478, -854, -854, + 996, 996, -108, -108, 991, 991, -308, -308, + 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, diff --git a/scripts/autogen b/scripts/autogen index 1c48171aff..ff9e1f3f82 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -497,7 +497,7 @@ class CondParser: def parse_condition(self, exp, simplify=True): 
try: - exp = self.parser.parseString(exp, parseAll=True).as_list()[0] + exp = self.parser.parse_string(exp, parse_all=True).as_list()[0] except pp.ParseException: print(f"WARNING: Ignoring condition '{exp}' I cannot parse") return exp @@ -2029,6 +2029,109 @@ def gen_riscv64_zeta_files(): ) +# The PPC64LE backend stores its twiddle factors in two include files, +# `consts_ntt.inc` and `consts_intt.inc`, which are concatenated into the +# `mlk_ppc_qdata` table in `consts.c`. The values are the canonical Montgomery +# zetas (matching `mlk_zetas` in `mlkem/src/zetas.inc`), arranged per layer to +# match the order in which the assembly consumes them. +# +# NTT layout (in order): +# - Layers Len=128, 64, 32, 16, 8: 31 zetas, each broadcast 8 times, +# in canonical order (`mlk_zetas[1..31]`). +# - Layer Len=4: 32 zetas, each broadcast 4 times, with adjacent pairs +# swapped (`mlk_zetas[32 + (i^1)]`). +# - Layer Len=2: 64 zetas (`mlk_zetas[64..127]`), grouped in 16 quadruples; +# each quadruple is permuted by (1,2,3,4) -> (3,1,4,2) and each entry +# duplicated to fill an 8-halfword vector. +# +# Inverse NTT layout (in order): +# - Layer Len=2: same permutation/duplication as the NTT, but applied to +# `mlk_zetas[127..64]` grouped in quadruples of decreasing index. +# - Layer Len=4: 32 zetas, each broadcast 4 times, in reverse canonical +# order with adjacent pairs swapped (`mlk_zetas[63 - (i^1)]`). +# - Layers Len=8, 16, 32, 64, 128: 31 zetas, each broadcast 8 times, +# in reverse canonical order (`mlk_zetas[31..1]`). +_PPC64LE_LEN2_PERM = [2, 0, 3, 1] + + +def gen_ppc64le_ntt_zetas(): + z = list(gen_c_zetas()) + + # Layers Len=128, 64, 32, 16, 8: broadcast x 8. + for i in range(1, 32): + yield from [z[i]] * 8 + + # Layer Len=4: broadcast x 4, adjacent pairs swapped. + for i in range(32): + yield from [z[32 + (i ^ 1)]] * 4 + + # Layer Len=2: per group of 4 zetas, permute (3,1,4,2) and duplicate each. 
+ for g in range(16): + src = z[64 + g * 4 : 64 + (g + 1) * 4] + for p in _PPC64LE_LEN2_PERM: + yield from [src[p]] * 2 + + +def gen_ppc64le_intt_zetas(): + z = list(gen_c_zetas()) + + # Layer Len=2: per group of 4 reverse-ordered zetas, same permutation + # and duplication as the NTT. + for g in range(16): + src = [z[127 - g * 4 - i] for i in range(4)] + for p in _PPC64LE_LEN2_PERM: + yield from [src[p]] * 2 + + # Layer Len=4: broadcast x 4, reverse canonical order, adjacent pairs + # swapped. + for i in range(32): + yield from [z[63 - (i ^ 1)]] * 4 + + # Layers Len=8, 16, 32, 64, 128: broadcast x 8, reverse canonical order. + for i in range(31): + yield from [z[31 - i]] * 8 + + +def gen_ppc64le_zeta_files(): + """Generate PPC64LE zeta include files.""" + + def gen_inc(zetas, leading_comment, entries_per_line=8): + yield from gen_header() + yield leading_comment + zetas = list(zetas) + for i in range(0, len(zetas), entries_per_line): + chunk = zetas[i : i + entries_per_line] + yield " " + ", ".join(str(t) for t in chunk) + "," + yield "" + + ntt_content = "\n".join( + gen_inc( + gen_ppc64le_ntt_zetas(), + "/* Twiddle factors for the PPC64LE forward NTT.\n * See autogen for details.\n */", + ) + ) + intt_content = "\n".join( + gen_inc( + gen_ppc64le_intt_zetas(), + "/* Twiddle factors for the PPC64LE inverse NTT.\n * See autogen for details.\n */", + ) + ) + + # The .inc files are #include'd by `consts.c` (not by an .S file), so they + # are not inlined via simpasm; we therefore write them directly into both + # the developer tree and the mlkem mirror. 
+ for path in ( + "dev/ppc64le/src/consts_ntt.inc", + "mlkem/src/native/ppc64le/src/consts_ntt.inc", + ): + update_file(path, ntt_content) + for path in ( + "dev/ppc64le/src/consts_intt.inc", + "mlkem/src/native/ppc64le/src/consts_intt.inc", + ): + update_file(path, intt_content) + + def get_c_source_files(main_only=False, core_only=False, strip_mlkem=False): if main_only is True: return get_files("mlkem/src/**/*.c", strip_mlkem=strip_mlkem) @@ -2864,6 +2967,8 @@ def update_via_simpasm( source_arch = "x86_64" elif "armv81m" in infile_full: source_arch = "armv81m" + elif "ppc64le" in infile_full: + source_arch = "ppc64le" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -2881,13 +2986,21 @@ def update_via_simpasm( return raise Exception(f"Could not find cross toolchain {cross_prefix}") elif native_arch != source_arch: - cross_prefix = f"{source_arch}-unknown-linux-gnu-" + # PPC64LE uses "powerpc64le" in the GNU triple, not the bare "ppc64le". + arch_triple = "powerpc64le" if source_arch == "ppc64le" else source_arch + cross_prefix = f"{arch_triple}-unknown-linux-gnu-" cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: - if force_cross is False: + if "--target=" in (cflags or ""): + # No cross-gcc, but an explicit target triple is already in + # cflags (e.g. via --cflags on macOS). Let simpasm + # use its default cc/nm (e.g. clang + llvm-nm on Darwin). 
+ cross_prefix = None + elif force_cross is False: return - raise Exception(f"Could not find cross toolchain {cross_prefix}") + else: + raise Exception(f"Could not find cross toolchain {cross_prefix}") else: cross_prefix = None @@ -2898,6 +3011,8 @@ def update_via_simpasm( arch = "aarch64" elif "armv81m" in infile_full: arch = "armv81m" + elif "ppc64le" in infile_full: + arch = "ppc64le" else: arch = "x86_64" @@ -2910,8 +3025,8 @@ def update_via_simpasm( "-o", tmp.name, ] - # TODO: Support CFI for Armv8.1-M - if arch != "armv81m": + # TODO: Support CFI for Armv8.1-M and ppc64le + if arch not in ("armv81m", "ppc64le"): cmd += ["--cfify"] if cross_prefix is not None: # Stick with llvm-objdump for disassembly @@ -3306,6 +3421,7 @@ def synchronize_backends( delete=False, no_simplify=False, x86_64_syntax="att", + extra_cflags=None, ): if clean is False: ty = "opt" @@ -3411,6 +3527,14 @@ def synchronize_backends( x86_64_syntax=x86_64_syntax, cflags="-Idev/fips202/x86_64 -Imlkem/src/fips202/native/x86_64 -mavx2 -mbmi2 -msse4 -fcf-protection=none", ) + synchronize_backend( + "dev/ppc64le/src", + "mlkem/src/native/ppc64le/src", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags=" ".join(filter(None, [extra_cflags, "-Idev/ppc64le/src -Imlkem/src/native/ppc64le/src -mcpu=power8"])), + ) def adjust_header_guard_for_filename(content, header_file): @@ -4216,6 +4340,13 @@ def _main(): default="att", help="Assembly syntax for x86_64 disassembly output (att or intel)", ) + parser.add_argument( + "--cflags", + type=str, + default=None, + metavar="FLAGS", + help="Extra CFLAGS prepended to ppc64le simpasm invocations", + ) args = parser.parse_args() @@ -4230,6 +4361,7 @@ def _main(): no_simplify=args.no_simplify, force_cross=args.force_cross, x86_64_syntax=args.x86_64_syntax, + extra_cflags=args.cflags, ) def sync_backends_final(): @@ -4239,6 +4371,7 @@ def _main(): force_cross=args.force_cross, no_simplify=args.no_simplify, 
x86_64_syntax=args.x86_64_syntax, + extra_cflags=args.cflags, ) # Build step list: (description, function, enabled) @@ -4270,6 +4403,7 @@ def _main(): gen_avx2_keccak_constants_c_file() gen_avx2_keccak_hol_light_constants_file() gen_riscv64_zeta_files() + gen_ppc64le_zeta_files() def gen_monolithic(): gen_monolithic_source_file() diff --git a/scripts/simpasm b/scripts/simpasm index ed88f826ca..cfcda590d6 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -364,8 +364,8 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug("Checking that byte-code is unchanged ...") - # When CFI is enabled or for Armv8.1-M, compare only the __text section content - if args.cfify or args.arch == "armv81m": + # When CFI is enabled or for Armv8.1-M/ppc64le, compare only the __text section content + if args.cfify or args.arch in ("armv81m", "ppc64le"): logger.debug("Comparing __text section content for CFI comparison...") # Extract __text section from both files @@ -433,7 +433,9 @@ def _main(): parser.add_argument( "--cc", type=str, default="gcc" if platform.system() != "Darwin" else "clang" ) - parser.add_argument("--nm", type=str, default="nm") + parser.add_argument( + "--nm", type=str, default="nm" if platform.system() != "Darwin" else "llvm-nm" + ) parser.add_argument("--objdump", type=str, default="objdump") parser.add_argument("--strip", type=str, default="llvm-strip") parser.add_argument("--cflags", type=str) From 47024dd0ab62fc7216c192cec1a103b1d705eabc Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Fri, 8 May 2026 09:17:59 +0200 Subject: [PATCH 21/27] rerun autogen Signed-off-by: Danny Tsen --- dev/ppc64le/src/intt_ppc_asm.S | 4 ++-- dev/ppc64le/src/ntt_ppc_asm.S | 4 ++-- dev/ppc64le/src/poly_tomont_ppc_asm.S | 4 ++-- dev/ppc64le/src/reduce_ppc_asm.S | 4 ++-- mlkem/mlkem_native_asm.S | 8 ++++---- mlkem/src/native/ppc64le/src/intt_ppc_asm.S | 6 +++--- mlkem/src/native/ppc64le/src/ntt_ppc_asm.S | 6 +++--- mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S | 6 
+++--- mlkem/src/native/ppc64le/src/reduce_ppc_asm.S | 6 +++--- 9 files changed, 24 insertions(+), 24 deletions(-) diff --git a/dev/ppc64le/src/intt_ppc_asm.S b/dev/ppc64le/src/intt_ppc_asm.S index 271d3e27fa..0c0a4d9fca 100644 --- a/dev/ppc64le/src/intt_ppc_asm.S +++ b/dev/ppc64le/src/intt_ppc_asm.S @@ -821,5 +821,5 @@ intt_ppc_asm_Loopf: #undef b4_offset /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/ntt_ppc_asm.S b/dev/ppc64le/src/ntt_ppc_asm.S index 56f48f6080..2e54c8f84f 100644 --- a/dev/ppc64le/src/ntt_ppc_asm.S +++ b/dev/ppc64le/src/ntt_ppc_asm.S @@ -646,5 +646,5 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) #undef b4_offset /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/poly_tomont_ppc_asm.S b/dev/ppc64le/src/poly_tomont_ppc_asm.S index 12779545a6..fdf1306924 100644 --- a/dev/ppc64le/src/poly_tomont_ppc_asm.S +++ b/dev/ppc64le/src/poly_tomont_ppc_asm.S @@ -182,5 +182,5 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) #undef V_NMKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/reduce_ppc_asm.S b/dev/ppc64le/src/reduce_ppc_asm.S index 51490d1f9c..4650310f3e 100644 --- a/dev/ppc64le/src/reduce_ppc_asm.S +++ b/dev/ppc64le/src/reduce_ppc_asm.S @@ -228,5 +228,5 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) #undef V_MKQ /* simpasm: footer-start */ -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && 
!MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index 74bc7fb467..9fddfc4da1 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -96,10 +96,10 @@ #if defined(MLK_SYS_RISCV64) #endif #if defined(MLK_SYS_PPC64LE) -#include "src/native/ppc64le/src/intt_ppc.S" -#include "src/native/ppc64le/src/ntt_ppc.S" -#include "src/native/ppc64le/src/poly_tomont.S" -#include "src/native/ppc64le/src/reduce.S" +#include "src/native/ppc64le/src/intt_ppc_asm.S" +#include "src/native/ppc64le/src/ntt_ppc_asm.S" +#include "src/native/ppc64le/src/poly_tomont_ppc_asm.S" +#include "src/native/ppc64le/src/reduce_ppc_asm.S" #endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S index ae08438d8e..ddd6d97b87 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S @@ -14,7 +14,7 @@ /* * WARNING: This file is auto-derived from the mlkem-native source file - * dev/ppc64le/src/intt_ppc.S using scripts/simpasm. Do not modify it directly. + * dev/ppc64le/src/intt_ppc_asm.S using scripts/simpasm. Do not modify it directly. 
*/ .text @@ -3411,8 +3411,8 @@ intt_ppc_asm_Loopf: MLK_ASM_FN_SIZE(intt_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ #if defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S index d017871578..31c4595357 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S @@ -14,7 +14,7 @@ /* * WARNING: This file is auto-derived from the mlkem-native source file - * dev/ppc64le/src/ntt_ppc.S using scripts/simpasm. Do not modify it directly. + * dev/ppc64le/src/ntt_ppc_asm.S using scripts/simpasm. Do not modify it directly. */ .text @@ -1784,8 +1784,8 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) MLK_ASM_FN_SIZE(ntt_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ #if defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S index acb25c4858..6739a61505 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S @@ -22,7 +22,7 @@ /* * WARNING: This file is auto-derived from the mlkem-native source file - * dev/ppc64le/src/poly_tomont.S using scripts/simpasm. Do not modify it directly. + * dev/ppc64le/src/poly_tomont_ppc_asm.S using scripts/simpasm. Do not modify it directly. 
*/ .text @@ -354,8 +354,8 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) MLK_ASM_FN_SIZE(poly_tomont_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ #if defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S index 9715b76d64..f084651bff 100644 --- a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S @@ -14,7 +14,7 @@ /* * WARNING: This file is auto-derived from the mlkem-native source file - * dev/ppc64le/src/reduce.S using scripts/simpasm. Do not modify it directly. + * dev/ppc64le/src/reduce_ppc_asm.S using scripts/simpasm. Do not modify it directly. */ .text @@ -705,8 +705,8 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) MLK_ASM_FN_SIZE(reduce_ppc_asm) -#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ \ - */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ #if defined(__ELF__) .section .note.GNU-stack,"",%progbits From ca81bf5b6ab5dc4ea8353f9665b8c59fada50094 Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Wed, 13 May 2026 13:00:20 +0200 Subject: [PATCH 22/27] Use Barrett for NTT twiddle products, and calculate the corresponding twisted zetas. 
Signed-off-by: Danny Tsen --- dev/ppc64le/src/consts.c | 35 +- dev/ppc64le/src/consts.h | 11 +- dev/ppc64le/src/consts_intt.inc | 126 +- dev/ppc64le/src/consts_intt_tw.inc | 77 ++ dev/ppc64le/src/consts_ntt.inc | 126 +- dev/ppc64le/src/consts_ntt_tw.inc | 77 ++ dev/ppc64le/src/intt_ppc_asm.S | 255 ++-- dev/ppc64le/src/ntt_ppc_asm.S | 168 +-- mlkem/mlkem_native.c | 5 +- mlkem/mlkem_native_asm.S | 5 +- mlkem/src/native/ppc64le/src/consts.c | 35 +- mlkem/src/native/ppc64le/src/consts.h | 11 +- mlkem/src/native/ppc64le/src/consts_intt.inc | 126 +- .../src/native/ppc64le/src/consts_intt_tw.inc | 77 ++ mlkem/src/native/ppc64le/src/consts_ntt.inc | 126 +- .../src/native/ppc64le/src/consts_ntt_tw.inc | 77 ++ mlkem/src/native/ppc64le/src/intt_ppc_asm.S | 1079 +++++++---------- mlkem/src/native/ppc64le/src/ntt_ppc_asm.S | 993 +++++++-------- .../native/ppc64le/src/poly_tomont_ppc_asm.S | 2 +- scripts/autogen | 67 +- 20 files changed, 1789 insertions(+), 1689 deletions(-) create mode 100644 dev/ppc64le/src/consts_intt_tw.inc create mode 100644 dev/ppc64le/src/consts_ntt_tw.inc create mode 100644 mlkem/src/native/ppc64le/src/consts_intt_tw.inc create mode 100644 mlkem/src/native/ppc64le/src/consts_ntt_tw.inc diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 74c8aa441c..4e4a3dde72 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -49,15 +49,28 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { 20159, 20159, 20159, - /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ - 1441, - 1441, - 1441, - 1441, - 1441, - 1441, - 1441, - 1441, + /* N^-1 in Montgomery form: pow(128,-1,MLKEM_Q) * 2^16 mod MLKEM_Q = 512. + * Multiplying by this via Barrett-fqmul scales INTT output by N^-1 and + * leaves it in Montgomery form (mlk_poly_invntt_tomont contract). 
*/ + 512, + 512, + 512, + 512, + 512, + 512, + 512, + 512, + /* check-magic: 5040 == round((512 * 2**16 + MLKEM_Q) / MLKEM_Q) // 2 */ + /* Barrett twist of N^-1*R = round_to_even(N_INV_MONT * 2^16 / MLKEM_Q) / 2 + */ + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ 1353, 1353, @@ -71,6 +84,10 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { #include "consts_ntt.inc" /* zetas for invNTT */ #include "consts_intt.inc" +/* twisted zetas for NTT (Barrett high-mul) */ +#include "consts_ntt_tw.inc" +/* twisted zetas for invNTT (Barrett high-mul) */ +#include "consts_intt_tw.inc" }; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index 704a372b8a..1622e8ccbb 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -13,10 +13,13 @@ #define MLK_PPC_QINV_OFFSET 16 #define MLK_PPC_Q_OFFSET 32 #define MLK_PPC_C20159_OFFSET 48 -#define MLK_PPC_C1441_OFFSET 64 -#define MLK_PPC_C1353_OFFSET 80 -#define MLK_PPC_ZETA_NTT_OFFSET 96 -#define MLK_PPC_ZETA_INTT_OFFSET 1104 +#define MLK_PPC_N_INV_OFFSET 64 +#define MLK_PPC_N_INV_TW_OFFSET 80 +#define MLK_PPC_C1353_OFFSET 96 +#define MLK_PPC_ZETA_NTT_OFFSET 112 +#define MLK_PPC_ZETA_INTT_OFFSET 1120 +#define MLK_PPC_ZETA_NTT_TW_OFFSET 2128 +#define MLK_PPC_ZETA_INTT_TW_OFFSET 3136 /* check-magic: on */ #ifndef __ASSEMBLER__ diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc index 0e1dd4367e..057b1df249 100644 --- a/dev/ppc64le/src/consts_intt.inc +++ b/dev/ppc64le/src/consts_intt.inc @@ -12,66 +12,66 @@ /* Twiddle factors for the PPC64LE inverse NTT. * See autogen for details. 
*/ - -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, - -308, -308, 991, 991, -108, -108, 996, 996, - -854, -854, 478, 478, -1510, -1510, -870, -870, - -1530, -1530, 794, 794, -1185, -1185, -1278, -1278, - 220, 220, -1659, -1659, -874, -874, -1187, -1187, - -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, - -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, - 1097, 1097, 610, 610, 817, 817, 603, 603, - 329, 329, -75, -75, 418, 418, -156, -156, - 644, 644, 349, 349, -1590, -1590, -872, -872, - 1483, 1483, 1119, 1119, -777, -777, -602, -602, - 778, 778, -147, -147, -246, -246, 1159, 1159, - -460, -460, 1653, 1653, -291, -291, 1574, 1574, - 587, 587, -235, -235, 422, 422, 177, 177, - 871, 871, 105, 105, -1251, -1251, 1550, 1550, - 430, 430, 843, 843, -1103, -1103, 555, 555, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, - 448, 448, 448, 448, -1065, -1065, -1065, -1065, - -1508, -1508, -1508, -1508, -725, -725, -725, -725, - -398, -398, -398, -398, 961, 961, 961, 961, - -247, -247, -247, -247, -951, -951, -951, -951, - 107, 107, 107, 107, -1421, -1421, -1421, -1421, - -271, -271, -271, -271, 830, 830, 830, 830, - -853, -853, -853, -853, -90, -90, -90, -90, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, - -1618, -1618, -1618, -1618, -1162, -1162, -1162, -1162, - -320, -320, -320, -320, -666, -666, -666, -666, - 516, 516, 516, 516, -8, -8, -8, -8, - -282, -282, -282, -282, -1544, -1544, -1544, -1544, - -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, - -552, -552, -552, -552, 1015, 1015, 1015, 1015, - 1223, 1223, 1223, 1223, 652, 652, 652, 652, - -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, - -205, -205, -205, -205, -205, -205, -205, -205, - 411, 411, 411, 411, 411, 411, 411, 411, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - 608, 608, 608, 608, 608, 608, 608, 608, - 732, 732, 732, 732, 732, 732, 732, 732, - 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, - -681, -681, -681, -681, -681, -681, -681, -681, - -130, -130, -130, -130, -130, 
-130, -130, -130, - -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, - 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, - -829, -829, -829, -829, -829, -829, -829, -829, - 383, 383, 383, 383, 383, 383, 383, 383, - 264, 264, 264, 264, 264, 264, 264, 264, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, - 573, 573, 573, 573, 573, 573, 573, 573, - 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, - -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - 962, 962, 962, 962, 962, 962, 962, 962, - 182, 182, 182, 182, 182, 182, 182, 182, - 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, - 622, 622, 622, 622, 622, 622, 622, 622, - -171, -171, -171, -171, -171, -171, -171, -171, - 202, 202, 202, 202, 202, 202, 202, 202, - 287, 287, 287, 287, 287, 287, 287, 287, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -359, -359, -359, -359, -359, -359, -359, -359, - -758, -758, -758, -758, -758, -758, -758, -758, + -394, -394, -1175, -1175, -1219, -1219, 885, 885, + 1212, 1212, 1029, 1029, -1607, -1607, -1455, -1455, + -1179, -1179, 886, 886, 1143, 1143, -554, -554, + 1092, 1092, 1026, 1026, -525, -525, 403, 403, + 561, 561, -735, -735, -1230, -1230, -863, -863, + 319, 319, 757, 757, 1063, 1063, -556, -556, + -780, -780, 1645, 1645, 375, 375, -1239, -1239, + -1031, -1031, -109, -109, 1584, 1584, -1292, -1292, + -992, -992, 641, 641, 733, 733, 268, 268, + -1021, -1021, -941, -941, 939, 939, -892, -892, + 952, 952, -642, -642, -1482, -1482, 1461, 1461, + 1651, 1651, -1540, -1540, -1626, -1626, -540, -540, + -1173, -1173, -279, -279, 756, 756, -314, -314, + -667, -667, 233, 233, 1409, 1409, -48, -48, + 723, 723, 1100, 1100, 1637, 1637, -1041, -1041, + -568, -568, -680, -680, 17, 17, 583, 583, + 1227, 1227, 1227, 1227, 910, 910, 910, 910, + -855, -855, -855, -855, -219, -219, -219, -219, + 1481, 1481, 
1481, 1481, 648, 648, 648, 648, + -682, -682, -682, -682, -712, -712, -712, -712, + 1534, 1534, 1534, 1534, -927, -927, -927, -927, + 1438, 1438, 1438, 1438, -461, -461, -461, -461, + 807, 807, 807, 807, 452, 452, 452, 452, + -1010, -1010, -1010, -1010, 1435, 1435, 1435, 1435, + 1320, 1320, 1320, 1320, -1414, -1414, -1414, -1414, + -464, -464, -464, -464, 33, 33, 33, 33, + -816, -816, -816, -816, 632, 632, 632, 632, + 650, 650, 650, 650, -1352, -1352, -1352, -1352, + -1052, -1052, -1052, -1052, -1274, -1274, -1274, -1274, + 1197, 1197, 1197, 1197, -1025, -1025, -1025, -1025, + -76, -76, -76, -76, -1573, -1573, -1573, -1573, + 289, 289, 289, 289, 331, 331, 331, 331, + 821, 821, 821, 821, 821, 821, 821, 821, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + -450, -450, -450, -450, -450, -450, -450, -450, + -936, -936, -936, -936, -936, -936, -936, -936, + -447, -447, -447, -447, -447, -447, -447, -447, + 535, 535, 535, 535, 535, 535, 535, 535, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 56, 56, 56, 56, 56, 56, 56, 56, + -283, -283, -283, -283, -283, -283, -283, -283, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + -882, -882, -882, -882, -882, -882, -882, -882, + 296, 296, 296, 296, 296, 296, 296, 296, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 569, 569, 569, 569, 569, 569, 569, 569, + -69, -69, -69, -69, -69, -69, -69, -69, + -543, -543, -543, -543, -543, -543, -543, -543, + 797, 797, 797, 797, 797, 797, 797, 797, + 193, 193, 193, 193, 193, 193, 193, 193, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + 848, 848, 848, 848, 848, 848, 848, 848, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 630, 630, 630, 630, 630, 630, 630, 630, + -687, -687, 
-687, -687, -687, -687, -687, -687, + -40, -40, -40, -40, -40, -40, -40, -40, + -749, -749, -749, -749, -749, -749, -749, -749, + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, diff --git a/dev/ppc64le/src/consts_intt_tw.inc b/dev/ppc64le/src/consts_intt_tw.inc new file mode 100644 index 0000000000..783e0af8f2 --- /dev/null +++ b/dev/ppc64le/src/consts_intt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. + */ + -3878, -3878, -11566, -11566, -11999, -11999, 8711, 8711, + 11930, 11930, 10129, 10129, -15818, -15818, -14322, -14322, + -11605, -11605, 8721, 8721, 11251, 11251, -5453, -5453, + 10749, 10749, 10099, 10099, -5168, -5168, 3967, 3967, + 5522, 5522, -7235, -7235, -12107, -12107, -8495, -8495, + 3140, 3140, 7451, 7451, 10463, 10463, -5473, -5473, + -7678, -7678, 16192, 16192, 3691, 3691, -12196, -12196, + -10148, -10148, -1073, -1073, 15592, 15592, -12717, -12717, + -9764, -9764, 6309, 6309, 7215, 7215, 2638, 2638, + -10050, -10050, -9262, -9262, 9243, 9243, -8780, -8780, + 9371, 9371, -6319, -6319, -14588, -14588, 14381, 14381, + 16251, 16251, -15159, -15159, -16005, -16005, -5315, -5315, + -11546, -11546, -2746, -2746, 7441, 7441, -3091, -3091, + -6565, -6565, 2293, 2293, 13869, 13869, -472, -472, + 7117, 7117, 10828, 10828, 16113, 16113, -10247, -10247, + -5591, -5591, -6693, -6693, 167, 167, 5739, 5739, + 12078, 12078, 12078, 12078, 8957, 8957, 8957, 8957, + -8416, -8416, -8416, -8416, -2156, -2156, -2156, -2156, + 14578, 14578, 14578, 14578, 6378, 6378, 6378, 6378, + -6713, -6713, -6713, -6713, -7008, -7008, -7008, -7008, + 15099, 15099, 15099, 15099, -9125, -9125, -9125, -9125, + 14155, 14155, 14155, 14155, -4538, -4538, -4538, -4538, 
+ 7943, 7943, 7943, 7943, 4449, 4449, 4449, 4449, + -9942, -9942, -9942, -9942, 14125, 14125, 14125, 14125, + 12993, 12993, 12993, 12993, -13918, -13918, -13918, -13918, + -4567, -4567, -4567, -4567, 325, 325, 325, 325, + -8032, -8032, -8032, -8032, 6221, 6221, 6221, 6221, + 6398, 6398, 6398, 6398, -13308, -13308, -13308, -13308, + -10355, -10355, -10355, -10355, -12540, -12540, -12540, -12540, + 11782, 11782, 11782, 11782, -10089, -10089, -10089, -10089, + -748, -748, -748, -748, -15483, -15483, -15483, -15483, + 2845, 2845, 2845, 2845, 3258, 3258, 3258, 3258, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, -4429, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 551, 551, 551, 551, 551, 551, 551, 551, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -679, -679, -679, -679, -679, -679, -679, -679, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + -14095, -14095, 
-14095, -14095, -14095, -14095, -14095, -14095, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + -394, -394, -394, -394, -394, -394, -394, -394, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, diff --git a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc index 2729155aab..e53bf13713 100644 --- a/dev/ppc64le/src/consts_ntt.inc +++ b/dev/ppc64le/src/consts_ntt.inc @@ -12,66 +12,66 @@ /* Twiddle factors for the PPC64LE forward NTT. * See autogen for details. */ - -758, -758, -758, -758, -758, -758, -758, -758, - -359, -359, -359, -359, -359, -359, -359, -359, - -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 287, 287, 287, 287, 287, 287, 287, 287, - 202, 202, 202, 202, 202, 202, 202, 202, - -171, -171, -171, -171, -171, -171, -171, -171, - 622, 622, 622, 622, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, - 182, 182, 182, 182, 182, 182, 182, 182, - 962, 962, 962, 962, 962, 962, 962, 962, - -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, - 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, - 264, 264, 264, 264, 264, 264, 264, 264, - 383, 383, 383, 383, 383, 383, 383, 383, - -829, -829, -829, -829, -829, -829, -829, -829, - 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, - -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, - -130, -130, -130, -130, -130, -130, -130, -130, - -681, -681, -681, -681, -681, -681, -681, -681, - 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, - 732, 732, 732, 732, 732, 732, 732, 732, - 608, 608, 608, 608, 608, 608, 608, 608, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - 411, 411, 411, 
411, 411, 411, 411, 411, - -205, -205, -205, -205, -205, -205, -205, -205, - -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, - 652, 652, 652, 652, 1223, 1223, 1223, 1223, - 1015, 1015, 1015, 1015, -552, -552, -552, -552, - 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, - -1544, -1544, -1544, -1544, -282, -282, -282, -282, - -8, -8, -8, -8, 516, 516, 516, 516, - -666, -666, -666, -666, -320, -320, -320, -320, - -1162, -1162, -1162, -1162, -1618, -1618, -1618, -1618, - 1469, 1469, 1469, 1469, 126, 126, 126, 126, - -90, -90, -90, -90, -853, -853, -853, -853, - 830, 830, 830, 830, -271, -271, -271, -271, - -1421, -1421, -1421, -1421, 107, 107, 107, 107, - -951, -951, -951, -951, -247, -247, -247, -247, - 961, 961, 961, 961, -398, -398, -398, -398, - -725, -725, -725, -725, -1508, -1508, -1508, -1508, - -1065, -1065, -1065, -1065, 448, 448, 448, 448, - -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 555, 555, -1103, -1103, 843, 843, 430, 430, - 1550, 1550, -1251, -1251, 105, 105, 871, 871, - 177, 177, 422, 422, -235, -235, 587, 587, - 1574, 1574, -291, -291, 1653, 1653, -460, -460, - 1159, 1159, -246, -246, -147, -147, 778, 778, - -602, -602, -777, -777, 1119, 1119, 1483, 1483, - -872, -872, -1590, -1590, 349, 349, 644, 644, - -156, -156, 418, 418, -75, -75, 329, 329, - 603, 603, 817, 817, 610, 610, 1097, 1097, - -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, - 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, - -1187, -1187, -874, -874, -1659, -1659, 220, 220, - -1278, -1278, -1185, -1185, 794, 794, -1530, -1530, - -870, -870, -1510, -1510, 478, 478, -854, -854, - 996, 996, -108, -108, 991, 991, -308, -308, - 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, + -749, -749, -749, -749, -749, -749, -749, -749, + -40, -40, -40, -40, -40, -40, -40, -40, + -687, -687, -687, -687, -687, -687, -687, -687, + 630, 630, 630, 630, 630, 630, 630, 630, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, 
-1432, + 848, 848, 848, 848, 848, 848, 848, 848, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 193, 193, 193, 193, 193, 193, 193, 193, + 797, 797, 797, 797, 797, 797, 797, 797, + -543, -543, -543, -543, -543, -543, -543, -543, + -69, -69, -69, -69, -69, -69, -69, -69, + 569, 569, 569, 569, 569, 569, 569, 569, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 296, 296, 296, 296, 296, 296, 296, 296, + -882, -882, -882, -882, -882, -882, -882, -882, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + -283, -283, -283, -283, -283, -283, -283, -283, + 56, 56, 56, 56, 56, 56, 56, 56, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 535, 535, 535, 535, 535, 535, 535, 535, + -447, -447, -447, -447, -447, -447, -447, -447, + -936, -936, -936, -936, -936, -936, -936, -936, + -450, -450, -450, -450, -450, -450, -450, -450, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + 821, 821, 821, 821, 821, 821, 821, 821, + 331, 331, 331, 331, 289, 289, 289, 289, + -1573, -1573, -1573, -1573, -76, -76, -76, -76, + -1025, -1025, -1025, -1025, 1197, 1197, 1197, 1197, + -1274, -1274, -1274, -1274, -1052, -1052, -1052, -1052, + -1352, -1352, -1352, -1352, 650, 650, 650, 650, + 632, 632, 632, 632, -816, -816, -816, -816, + 33, 33, 33, 33, -464, -464, -464, -464, + -1414, -1414, -1414, -1414, 1320, 1320, 1320, 1320, + 1435, 1435, 1435, 1435, -1010, -1010, -1010, -1010, + 452, 452, 452, 452, 807, 807, 807, 807, + -461, -461, -461, -461, 1438, 1438, 1438, 1438, + -927, -927, -927, -927, 1534, 1534, 1534, 1534, + -712, -712, -712, -712, -682, -682, -682, -682, + 648, 648, 648, 648, 1481, 1481, 1481, 1481, + -219, -219, -219, -219, -855, -855, -855, -855, + 910, 910, 910, 910, 1227, 1227, 1227, 
1227, + 583, 583, 17, 17, -680, -680, -568, -568, + -1041, -1041, 1637, 1637, 1100, 1100, 723, 723, + -48, -48, 1409, 1409, 233, 233, -667, -667, + -314, -314, 756, 756, -279, -279, -1173, -1173, + -540, -540, -1626, -1626, -1540, -1540, 1651, 1651, + 1461, 1461, -1482, -1482, -642, -642, 952, 952, + -892, -892, 939, 939, -941, -941, -1021, -1021, + 268, 268, 733, 733, 641, 641, -992, -992, + -1292, -1292, 1584, 1584, -109, -109, -1031, -1031, + -1239, -1239, 375, 375, 1645, 1645, -780, -780, + -556, -556, 1063, 1063, 757, 757, 319, 319, + -863, -863, -1230, -1230, -735, -735, 561, 561, + 403, 403, -525, -525, 1026, 1026, 1092, 1092, + -554, -554, 1143, 1143, 886, 886, -1179, -1179, + -1455, -1455, -1607, -1607, 1029, 1029, 1212, 1212, + 885, 885, -1219, -1219, -1175, -1175, -394, -394, diff --git a/dev/ppc64le/src/consts_ntt_tw.inc b/dev/ppc64le/src/consts_ntt_tw.inc new file mode 100644 index 0000000000..a191b3bf2f --- /dev/null +++ b/dev/ppc64le/src/consts_ntt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE forward NTT. + * See autogen for details. 
+ */ + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -394, -394, -394, -394, -394, -394, -394, -394, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + -679, -679, -679, -679, -679, -679, -679, -679, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 551, 551, 551, 551, 551, 551, 551, 551, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, -4429, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + 3258, 3258, 3258, 3258, 2845, 2845, 2845, 2845, + -15483, -15483, -15483, -15483, -748, -748, -748, -748, + -10089, -10089, -10089, -10089, 11782, 11782, 11782, 11782, + -12540, -12540, -12540, -12540, -10355, -10355, -10355, -10355, 
+ -13308, -13308, -13308, -13308, 6398, 6398, 6398, 6398, + 6221, 6221, 6221, 6221, -8032, -8032, -8032, -8032, + 325, 325, 325, 325, -4567, -4567, -4567, -4567, + -13918, -13918, -13918, -13918, 12993, 12993, 12993, 12993, + 14125, 14125, 14125, 14125, -9942, -9942, -9942, -9942, + 4449, 4449, 4449, 4449, 7943, 7943, 7943, 7943, + -4538, -4538, -4538, -4538, 14155, 14155, 14155, 14155, + -9125, -9125, -9125, -9125, 15099, 15099, 15099, 15099, + -7008, -7008, -7008, -7008, -6713, -6713, -6713, -6713, + 6378, 6378, 6378, 6378, 14578, 14578, 14578, 14578, + -2156, -2156, -2156, -2156, -8416, -8416, -8416, -8416, + 8957, 8957, 8957, 8957, 12078, 12078, 12078, 12078, + 5739, 5739, 167, 167, -6693, -6693, -5591, -5591, + -10247, -10247, 16113, 16113, 10828, 10828, 7117, 7117, + -472, -472, 13869, 13869, 2293, 2293, -6565, -6565, + -3091, -3091, 7441, 7441, -2746, -2746, -11546, -11546, + -5315, -5315, -16005, -16005, -15159, -15159, 16251, 16251, + 14381, 14381, -14588, -14588, -6319, -6319, 9371, 9371, + -8780, -8780, 9243, 9243, -9262, -9262, -10050, -10050, + 2638, 2638, 7215, 7215, 6309, 6309, -9764, -9764, + -12717, -12717, 15592, 15592, -1073, -1073, -10148, -10148, + -12196, -12196, 3691, 3691, 16192, 16192, -7678, -7678, + -5473, -5473, 10463, 10463, 7451, 7451, 3140, 3140, + -8495, -8495, -12107, -12107, -7235, -7235, 5522, 5522, + 3967, 3967, -5168, -5168, 10099, 10099, 10749, 10749, + -5453, -5453, 11251, 11251, 8721, 8721, -11605, -11605, + -14322, -14322, -15818, -15818, 10129, 10129, 11930, 11930, + 8711, 8711, -11999, -11999, -11566, -11566, -3878, -3878, diff --git a/dev/ppc64le/src/intt_ppc_asm.S b/dev/ppc64le/src/intt_ppc_asm.S index 0c0a4d9fca..bd92cdad50 100644 --- a/dev/ppc64le/src/intt_ppc_asm.S +++ b/dev/ppc64le/src/intt_ppc_asm.S @@ -17,21 +17,35 @@ .text -/* Barrett reduce constants */ +/* Barrett-Q-reduce constants */ #define V20159 0 #define V2pw25 1 #define V_26 2 #define V_MKQ 3 -/* Montgomery reduce constants */ -#define V_QINV 2 +/* Shared 
zero vector (aliases V_MKQ; V_MKQ is only live during + * barrett_reduce_4x, where V_ZERO/zero is reloaded from vs3 anyway). */ +#define V_ZERO 3 + +/* Barrett-multiply constants */ #define V_NMKQ 5 #define V_Z0 7 #define V_Z1 8 #define V_Z2 9 #define V_Z3 10 #define V_ZETA 10 -#define V1441 10 +#define V_NINV 10 +#define V_NINV_TW 11 + +/* Barrett twisted zetas: zt = round_to_even(z * 2^16 / q) / 2. + * Placed in vdata_b slots (free by the time Load_next_4zetas runs) to + * avoid the V20159/V2pw25/V_26/V_MKQ/V_Z0..3 constants required by the + * next Barrett-Q-reduce. V_ZETATW aliases V_ZT0 (broadcast layer 5-7). */ +#define V_ZT0 12 +#define V_ZT1 20 +#define V_ZT2 6 +#define V_ZT3 11 +#define V_ZETATW 12 #define vdata_a1 21 #define vdata_a2 22 @@ -47,25 +61,26 @@ #define vdata_brt3 16 #define vdata_brt4 20 -#define vdata_mont1 25 -#define vdata_mont2 26 -#define vdata_mont3 30 -#define vdata_mont4 31 +#define vdata_t1 25 +#define vdata_t2 26 +#define vdata_t3 30 +#define vdata_t4 31 #define vresult_brt1 4 #define vresult_brt2 9 #define vresult_brt3 13 #define vresult_brt4 17 -#define vresult_mont1 13 -#define vresult_mont2 18 -#define vresult_mont3 23 -#define vresult_mont4 28 +#define vresult_t1 13 +#define vresult_t2 18 +#define vresult_t3 23 +#define vresult_t4 28 #define rinp 3 #define dup_rinp 5 #define qinp 4 #define len_2 7 -#define zeta_inp 14 +#define zeta_inp 14 +#define zeta_tw_inp 22 #define a1_offset 9 #define a2_offset 16 #define a3_offset 18 @@ -86,6 +101,7 @@ std 19, 96(1) std 20, 104(1) std 21, 112(1) + std 22, 120(1) li 10, 128 li 11, 144 li 12, 160 @@ -145,6 +161,7 @@ ld 19, 96(1) ld 20, 104(1) ld 21, 112(1) + ld 22, 120(1) mtlr 0 addi 1, 1, 352 @@ -152,14 +169,14 @@ /* * Compute r[j] and r[j+len] from computed coefficients - * r[j] + r[j+len] : V8, V12, V16, V20 (data for Barett reduce) - * r[j+len] - r[j]: V25, V26, V30, V31 (data for Montgomery reduce) + * r[j] + r[j+len] : V8, V12, V16, V20 (data for Barrett Q-reduce) + * r[j+len] - r[j]: 
V25, V26, V30, V31 (data for Barrett fqmul) */ .macro Compute_4Coeffs - vsubuhm vdata_mont1, vdata_b1, vdata_a1 - vsubuhm vdata_mont2, vdata_b2, vdata_a2 - vsubuhm vdata_mont3, vdata_b3, vdata_a3 - vsubuhm vdata_mont4, vdata_b4, vdata_a4 + vsubuhm vdata_t1, vdata_b1, vdata_a1 + vsubuhm vdata_t2, vdata_b2, vdata_a2 + vsubuhm vdata_t3, vdata_b3, vdata_a3 + vsubuhm vdata_t4, vdata_b4, vdata_a4 vadduhm vdata_brt1, vdata_b1, vdata_a1 vadduhm vdata_brt2, vdata_b2, vdata_a2 vadduhm vdata_brt3, vdata_b3, vdata_a3 @@ -359,48 +376,30 @@ .endm /* - * ----------------------------------- - * mont_reduce_4x(_vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3) - */ -.macro mont_reduce_4x _vz0, _vz1, _vz2, _vz3, _vo0, _vo1, _vo2, _vo3 - /* Modular multiplication bound by 2^16 * q in abs value */ - vmladduhm 15, vdata_mont1, \_vz0, rinp - vmladduhm 20, vdata_mont2, \_vz1, rinp - vmladduhm 27, vdata_mont3, \_vz2, rinp - vmladduhm 28, vdata_mont4, \_vz3, rinp - - /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, vdata_mont1, \_vz0, rinp - vmhraddshs 19, vdata_mont2, \_vz1, rinp - vmhraddshs 24, vdata_mont3, \_vz2, rinp - vmhraddshs 29, vdata_mont4, \_vz3, rinp - - vmladduhm 15, 15, V_QINV, 3 - vmladduhm 20, 20, V_QINV, 3 - vmladduhm 25, 27, V_QINV, 3 - vmladduhm 30, 28, V_QINV, 3 - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - /* Shift right 1 bit */ - vsrah \_vo0, 15, 4 - vsrah \_vo1, 20, 4 - vsrah \_vo2, 25, 4 - vsrah \_vo3, 30, 4 -.endm - -/* - * setup constant vectors for Montgmery multiplication - * V_NMKQ, V_QINV, Zero vector, One vector + * Barrett multiplication for the per-layer fqmul (4 lanes). 
+ * For each lane (b in vdata_t_i, z in _vz_i, zt in _vzt_i): + * t = vmhraddshs(b, zt, 0) = round(b*zt / 2^15) + * b_lo = vmladduhm(b, z, 0) = (b*z) mod 2^16 + * vo = vmladduhm(t, -q, b_lo) = b*z - t*q (mod 2^16) + * + * Yields the signed canonical representative of (b*z) mod q, + * bounded by q/2. V_ZERO is the zero vector set at function entry. */ -.macro Set_mont_consts - xxlor 32+5, 0, 0 /* V_NMKQ */ - xxlor 32+2, 2, 2 /* V_QINV */ - xxlor 32+3, 3, 3 /* all 0 */ - xxlor 32+4, 4, 4 /* all 1 */ +.macro barrett_fqmul_4x _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3, _vo0, _vo1, _vo2, _vo3 + vmhraddshs 14, vdata_t1, \_vzt0, V_ZERO + vmhraddshs 19, vdata_t2, \_vzt1, V_ZERO + vmhraddshs 24, vdata_t3, \_vzt2, V_ZERO + vmhraddshs 29, vdata_t4, \_vzt3, V_ZERO + + vmladduhm \_vo0, vdata_t1, \_vz0, V_ZERO + vmladduhm \_vo1, vdata_t2, \_vz1, V_ZERO + vmladduhm \_vo2, vdata_t3, \_vz2, V_ZERO + vmladduhm \_vo3, vdata_t4, \_vz3, V_ZERO + + vmladduhm \_vo0, 14, V_NMKQ, \_vo0 + vmladduhm \_vo1, 19, V_NMKQ, \_vo1 + vmladduhm \_vo2, 24, V_NMKQ, \_vo2 + vmladduhm \_vo3, 29, V_NMKQ, \_vo3 .endm .macro Load_next_4zetas @@ -411,7 +410,12 @@ lxvd2x 32+V_Z1, 8, zeta_inp lxvd2x 32+V_Z2, 11, zeta_inp lxvd2x 32+V_Z3, 12, zeta_inp + lxvd2x 32+V_ZT0, 0, zeta_tw_inp + lxvd2x 32+V_ZT1, 8, zeta_tw_inp + lxvd2x 32+V_ZT2, 11, zeta_tw_inp + lxvd2x 32+V_ZT3, 12, zeta_tw_inp addi zeta_inp, zeta_inp, 64 + addi zeta_tw_inp, zeta_tw_inp, 64 .endm .macro Write_B4C _vs0, _vs1, _vs2, _vs3 @@ -429,10 +433,10 @@ .endm .macro Reload_4coeffs - lxvd2x 32+vdata_mont1, 0, rinp - lxvd2x 32+vdata_mont2, 10, rinp - lxvd2x 32+vdata_mont3, 11, rinp - lxvd2x 32+vdata_mont4, 12, rinp + lxvd2x 32+vdata_t1, 0, rinp + lxvd2x 32+vdata_t2, 10, rinp + lxvd2x 32+vdata_t3, 11, rinp + lxvd2x 32+vdata_t4, 12, rinp addi rinp, rinp, 64 .endm @@ -458,14 +462,14 @@ xxlor 32+19, 11, 11 xxlor 32+24, 12, 12 xxlor 32+29, 13, 13 - xxpermdi 32+10, 32+14, 32+vresult_mont1, 3 - xxpermdi 32+11, 32+14, 32+vresult_mont1, 0 - xxpermdi 
32+12, 32+19, 32+vresult_mont2, 3 - xxpermdi 32+13, 32+19, 32+vresult_mont2, 0 - xxpermdi 32+14, 32+24, 32+vresult_mont3, 3 - xxpermdi 32+15, 32+24, 32+vresult_mont3, 0 - xxpermdi 32+16, 32+29, 32+vresult_mont4, 3 - xxpermdi 32+17, 32+29, 32+vresult_mont4, 0 + xxpermdi 32+10, 32+14, 32+vresult_t1, 3 + xxpermdi 32+11, 32+14, 32+vresult_t1, 0 + xxpermdi 32+12, 32+19, 32+vresult_t2, 3 + xxpermdi 32+13, 32+19, 32+vresult_t2, 0 + xxpermdi 32+14, 32+24, 32+vresult_t3, 3 + xxpermdi 32+15, 32+24, 32+vresult_t3, 0 + xxpermdi 32+16, 32+29, 32+vresult_t4, 3 + xxpermdi 32+17, 32+29, 32+vresult_t4, 0 stxvd2x 32+10, 0, dup_rinp stxvd2x 32+11, 10, dup_rinp stxvd2x 32+12, 11, dup_rinp @@ -485,14 +489,14 @@ xxlor 32+19, 11, 11 xxlor 32+24, 12, 12 xxlor 32+29, 13, 13 - vmrgew 10, vresult_mont1, 14 - vmrgow 11, vresult_mont1, 14 - vmrgew 12, vresult_mont2, 19 - vmrgow 13, vresult_mont2, 19 - vmrgew 14, vresult_mont3, 24 - vmrgow 15, vresult_mont3, 24 - vmrgew 16, vresult_mont4, 29 - vmrgow 17, vresult_mont4, 29 + vmrgew 10, vresult_t1, 14 + vmrgow 11, vresult_t1, 14 + vmrgew 12, vresult_t2, 19 + vmrgow 13, vresult_t2, 19 + vmrgew 14, vresult_t3, 24 + vmrgow 15, vresult_t3, 24 + vmrgew 16, vresult_t4, 29 + vmrgow 17, vresult_t4, 29 stxvd2x 32+10, 0, dup_rinp stxvd2x 32+11, 10, dup_rinp stxvd2x 32+12, 11, dup_rinp @@ -514,9 +518,10 @@ xxlor 11, 32+vresult_brt2, 32+vresult_brt2 xxlor 12, 32+vresult_brt3, 32+vresult_brt3 xxlor 13, 32+vresult_brt4, 32+vresult_brt4 - Set_mont_consts + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ Load_next_4zetas - mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3, vresult_t1, vresult_t2, vresult_t3, vresult_t4 PermWriteL24 .endm @@ -531,9 +536,10 @@ xxlor 11, 32+vresult_brt2, 32+vresult_brt2 xxlor 12, 32+vresult_brt3, 32+vresult_brt3 xxlor 13, 32+vresult_brt4, 32+vresult_brt4 - 
Set_mont_consts + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ Load_next_4zetas - mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3, vresult_t1, vresult_t2, vresult_t3, vresult_t4 PermWriteL44 .endm @@ -544,10 +550,11 @@ Load_4Coeffs \start, \next barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 - Set_mont_consts + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ Load_next_4zetas - mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 - Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3, vresult_t1, vresult_t2, vresult_t3, vresult_t4 + Write_M4C 32+vresult_t1, 32+vresult_t2, 32+vresult_t3, 32+vresult_t4 .endm /* @@ -557,10 +564,12 @@ Load_4Coeffs \start, \next barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 - Set_mont_consts - lvx V_ZETA, 0, 14 - mont_reduce_4x V_ZETA, V_ZETA, V_ZETA, V_ZETA, vresult_mont1, vresult_mont2, vresult_mont3, vresult_mont4 - Write_M4C 32+vresult_mont1, 32+vresult_mont2, 32+vresult_mont3, 32+vresult_mont4 + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + barrett_fqmul_4x V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW, vresult_t1, vresult_t2, vresult_t3, vresult_t4 + Write_M4C 32+vresult_t1, 32+vresult_t2, 32+vresult_t3, 32+vresult_t4 .endm /* @@ -603,17 +612,11 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) SAVE_REGS - /* init vectors and constants - Setup for 
Montgomery reduce */ - lxvx 0, 0, qinp + /* init vectors and constants */ + lxvx 0, 0, qinp /* -Q */ - li 10, MLK_PPC_QINV_OFFSET - lxvx 32+V_QINV, 10, qinp - xxlxor 32+3, 32+3, 32+3 - vspltish 4, 1 - xxlor 2, 32+2, 32+2 /* QINV */ - xxlor 3, 32+3, 32+3 /* 0 vector */ - xxlor 4, 32+4, 32+4 /* 1 vector */ + xxlxor 32+V_ZERO, 32+V_ZERO, 32+V_ZERO + xxlor 3, 32+V_ZERO, 32+V_ZERO /* save zero vector to vs3 */ /* Setup for Barrett reduce */ li 10, MLK_PPC_Q_OFFSET @@ -639,19 +642,21 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) li 18, 112 /* - * Montgomery reduce loops with constant 1441 + * Scale every coefficient by N^-1 via Barrett multiplication. */ - addi zeta_inp, qinp, MLK_PPC_C1441_OFFSET - lvx V1441, 0, zeta_inp + addi zeta_inp, qinp, MLK_PPC_N_INV_OFFSET + lvx V_NINV, 0, zeta_inp + addi zeta_inp, qinp, MLK_PPC_N_INV_TW_OFFSET + lvx V_NINV_TW, 0, zeta_inp li 8, 4 mtctr 8 - Set_mont_consts + xxlor 32+V_NMKQ, 0, 0 /* V_NMKQ = -Q */ intt_ppc_asm_Loopf: Reload_4coeffs - mont_reduce_4x V1441, V1441, V1441, V1441, 6, 7, 8, 9 + barrett_fqmul_4x V_NINV, V_NINV, V_NINV, V_NINV, V_NINV_TW, V_NINV_TW, V_NINV_TW, V_NINV_TW, 6, 7, 8, 9 Reload_4coeffs - mont_reduce_4x V1441, V1441, V1441, V1441, 13, 18, 23, 28 + barrett_fqmul_4x V_NINV, V_NINV, V_NINV, V_NINV, V_NINV_TW, V_NINV_TW, V_NINV_TW, V_NINV_TW, 13, 18, 23, 28 MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 bdnz intt_ppc_asm_Loopf @@ -667,6 +672,7 @@ intt_ppc_asm_Loopf: * Load zeta vectors in 2-2-2-2 layout */ addi zeta_inp, qinp, MLK_PPC_ZETA_INTT_OFFSET + addi zeta_tw_inp, qinp, MLK_PPC_ZETA_INTT_TW_OFFSET li len_2, 4 /* len * 2 */ mr dup_rinp, rinp @@ -719,11 +725,13 @@ intt_ppc_asm_Loopf: intt_layer34 0, 64 addi zeta_inp, zeta_inp, -64 + addi zeta_tw_inp, zeta_tw_inp, -64 intt_layer34 16, 64 intt_layer34 256, 64 addi zeta_inp, zeta_inp, -64 + addi zeta_tw_inp, zeta_tw_inp, -64 intt_layer34 272, 64 .balign 16 @@ -734,12 +742,16 @@ intt_ppc_asm_Loopf: intt_layer567 0, 16 addi zeta_inp, zeta_inp, 16 + addi 
zeta_tw_inp, zeta_tw_inp, 16 intt_layer567 128, 16 addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 intt_layer567 256, 16 addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 intt_layer567 384, 16 addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 .balign 16 /* @@ -750,9 +762,11 @@ intt_ppc_asm_Loopf: intt_layer567 0, 16 intt_layer567 64, 16 addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 intt_layer567 256, 16 intt_layer567 320, 16 addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 .balign 16 /* @@ -774,14 +788,20 @@ intt_ppc_asm_Loopf: #undef V2pw25 #undef V_26 #undef V_MKQ -#undef V_QINV +#undef V_ZERO #undef V_NMKQ #undef V_Z0 #undef V_Z1 #undef V_Z2 #undef V_Z3 #undef V_ZETA -#undef V1441 +#undef V_NINV +#undef V_NINV_TW +#undef V_ZT0 +#undef V_ZT1 +#undef V_ZT2 +#undef V_ZT3 +#undef V_ZETATW #undef vdata_a1 #undef vdata_a2 #undef vdata_a3 @@ -794,23 +814,24 @@ intt_ppc_asm_Loopf: #undef vdata_brt2 #undef vdata_brt3 #undef vdata_brt4 -#undef vdata_mont1 -#undef vdata_mont2 -#undef vdata_mont3 -#undef vdata_mont4 +#undef vdata_t1 +#undef vdata_t2 +#undef vdata_t3 +#undef vdata_t4 #undef vresult_brt1 #undef vresult_brt2 #undef vresult_brt3 #undef vresult_brt4 -#undef vresult_mont1 -#undef vresult_mont2 -#undef vresult_mont3 -#undef vresult_mont4 +#undef vresult_t1 +#undef vresult_t2 +#undef vresult_t3 +#undef vresult_t4 #undef rinp #undef dup_rinp #undef qinp #undef len_2 #undef zeta_inp +#undef zeta_tw_inp #undef a1_offset #undef a2_offset #undef a3_offset diff --git a/dev/ppc64le/src/ntt_ppc_asm.S b/dev/ppc64le/src/ntt_ppc_asm.S index 2e54c8f84f..33bccdb9cb 100644 --- a/dev/ppc64le/src/ntt_ppc_asm.S +++ b/dev/ppc64le/src/ntt_ppc_asm.S @@ -15,13 +15,18 @@ #include "consts.h" -#define V_QINV 2 -#define V_NMKQ 5 -#define V_Z0 7 -#define V_Z1 8 -#define V_Z2 9 -#define V_Z3 10 -#define V_ZETA 10 +#define V_ZERO 3 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 
+#define V_ZETA 10 +#define V_ZT0 0 +#define V_ZT1 1 +#define V_ZT2 6 +#define V_ZT3 11 +#define V_ZETATW 2 #define vdata_a1 12 #define vdata_a2 17 @@ -45,7 +50,8 @@ #define dup_rinp 5 #define qinp 4 #define len_2 7 -#define zeta_inp 14 +#define zeta_inp 14 +#define zeta_tw_inp 22 #define a1_offset 9 #define a2_offset 16 #define a3_offset 18 @@ -68,6 +74,7 @@ std 19, 96(1) std 20, 104(1) std 21, 112(1) + std 22, 120(1) li 10, 128 li 11, 144 li 12, 160 @@ -127,6 +134,7 @@ ld 19, 96(1) ld 20, 104(1) ld 21, 112(1) + ld 22, 120(1) mtlr 0 addi 1, 1, 352 @@ -256,42 +264,30 @@ .endm /* - * montgomery_reduce - * t = a * QINV - * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * Barrett multiplication (4 lanes). + * For each lane (b in vdata_b_i, z in _vz_i, zt in _vzt_i): + * t = vmhraddshs(b, zt, 0) = round(b*zt / 2^15) + * b_lo = vmladduhm(b, z, 0) = (b*z) mod 2^16 + * vdata_b_i = vmladduhm(t, -q, b_lo) = b*z - t*q (mod 2^16) * - * ----------------------------------- - * mont_reduce_4x(_vz0, _vz1, _vz2, _vz3) + * Yields a signed representative of (b*z) mod q, + * bounded by q in absolute value.
*/ -.macro mont_reduce_4x _vz0, _vz1, _vz2, _vz3 - /* fqmul = zeta * coefficient - Modular multiplication bound by 2^16 * q in abs value */ - vmladduhm 15, vdata_b1, \_vz0, rinp - vmladduhm 20, vdata_b2, \_vz1, rinp - vmladduhm 25, vdata_b3, \_vz2, rinp - vmladduhm 30, vdata_b4, \_vz3, rinp - - /* Signed multiply-high-round; outputs are bound by 2^15 * q in abs value */ - vmhraddshs 14, vdata_b1, \_vz0, rinp - vmhraddshs 19, vdata_b2, \_vz1, rinp - vmhraddshs 24, vdata_b3, \_vz2, rinp - vmhraddshs 29, vdata_b4, \_vz3, rinp - - vmladduhm 15, 15, V_QINV, rinp - vmladduhm 20, 20, V_QINV, rinp - vmladduhm 25, 25, V_QINV, rinp - vmladduhm 30, 30, V_QINV, rinp - - vmhraddshs 15, 15, V_NMKQ, 14 - vmhraddshs 20, 20, V_NMKQ, 19 - vmhraddshs 25, 25, V_NMKQ, 24 - vmhraddshs 30, 30, V_NMKQ, 29 - - /* Shift right 1 bit */ - vsrah vdata_b1, 15, 4 - vsrah vdata_b2, 20, 4 - vsrah vdata_b3, 25, 4 - vsrah vdata_b4, 30, 4 +.macro barrett_fqmul_4x _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3 + vmhraddshs vresult_a1, vdata_b1, \_vzt0, V_ZERO + vmhraddshs vresult_a2, vdata_b2, \_vzt1, V_ZERO + vmhraddshs vresult_a3, vdata_b3, \_vzt2, V_ZERO + vmhraddshs vresult_a4, vdata_b4, \_vzt3, V_ZERO + + vmladduhm vdata_b1, vdata_b1, \_vz0, V_ZERO + vmladduhm vdata_b2, vdata_b2, \_vz1, V_ZERO + vmladduhm vdata_b3, vdata_b3, \_vz2, V_ZERO + vmladduhm vdata_b4, vdata_b4, \_vz3, V_ZERO + + vmladduhm vdata_b1, vresult_a1, V_NMKQ, vdata_b1 + vmladduhm vdata_b2, vresult_a2, V_NMKQ, vdata_b2 + vmladduhm vdata_b3, vresult_a3, V_NMKQ, vdata_b3 + vmladduhm vdata_b4, vresult_a4, V_NMKQ, vdata_b4 .endm /* @@ -312,8 +308,8 @@ * final r[j]: V15, V20, V25, V30 */ .macro Compute_4Coeffs - /* Since the result of the Montgomery multiplication is bounded - by q in absolute value. + /* Since the result of the Barrett multiplication is bounded + by q in absolute value. Finally to complete the final update of the results with add/sub r[j] = r[j] + t.
r[j+len] = r[j] - t @@ -395,7 +391,12 @@ lxvd2x 32+V_Z1, 10, zeta_inp lxvd2x 32+V_Z2, 11, zeta_inp lxvd2x 32+V_Z3, 12, zeta_inp + lxvd2x 32+V_ZT0, 0, zeta_tw_inp + lxvd2x 32+V_ZT1, 10, zeta_tw_inp + lxvd2x 32+V_ZT2, 11, zeta_tw_inp + lxvd2x 32+V_ZT3, 12, zeta_tw_inp addi zeta_inp, zeta_inp, 64 + addi zeta_tw_inp, zeta_tw_inp, 64 .endm /* @@ -404,7 +405,7 @@ .macro ntt_layer7 Load_next_4zetas Load_L24Coeffs - mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3 + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 PermWriteL24 addi dup_rinp, dup_rinp, 128 .endm @@ -415,7 +416,7 @@ .macro ntt_layer6 Load_next_4zetas Load_L44Coeffs - mont_reduce_4x V_Z0, V_Z1, V_Z2, V_Z3 + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 PermWriteL44 addi dup_rinp, dup_rinp, 128 .endm @@ -423,9 +424,9 @@ /* * NTT other layers, 1, 2, 3, 4, 5. */ -.macro ntt_layer12345 start, next, _vz0, _vz1, _vz2, _vz3 +.macro ntt_layer12345 start, next, _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3 Load_4Coeffs \start, \next - mont_reduce_4x \_vz0, \_vz1, \_vz2, \_vz3 + barrett_fqmul_4x \_vz0, \_vz1, \_vz2, \_vz3, \_vzt0, \_vzt1, \_vzt2, \_vzt3 Load_4Rj Compute_4Coeffs Write_One @@ -475,14 +476,11 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) /* load -MLKEM_Q */ lvx V_NMKQ,0,qinp - /* Register 14 as pointer to zetas array */ + /* zeta_inp: regular Barrett roots; zeta_tw_inp: twisted roots */ addi zeta_inp, qinp, MLK_PPC_ZETA_NTT_OFFSET + addi zeta_tw_inp, qinp, MLK_PPC_ZETA_NTT_TW_OFFSET - vxor 3, 3, 3 - vspltish 4, 1 - - li 10, MLK_PPC_QINV_OFFSET - lvx V_QINV, 10, qinp + vxor V_ZERO, V_ZERO, V_ZERO .balign 16 /* @@ -490,12 +488,14 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 256 /* len * 2 */ lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 - ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - 
ntt_layer12345 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW .balign 16 /* @@ -503,14 +503,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 128 lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 - ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 - ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA - ntt_layer12345 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW .balign 16 /* @@ -518,20 +522,28 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 64 lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 - ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 - ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW 
lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 - ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp addi zeta_inp, zeta_inp, 16 - ntt_layer12345 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW .balign 16 /* @@ -539,12 +551,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 32 Load_next_4zetas - ntt_layer12345 0, 64, V_Z0, V_Z1, V_Z2, V_Z3 - ntt_layer12345 16, 64, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 0, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + ntt_layer12345 16, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 Load_next_4zetas - ntt_layer12345 256, 64, V_Z0, V_Z1, V_Z2, V_Z3 - ntt_layer12345 272, 64, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 256, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + ntt_layer12345 272, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 .balign 16 /* @@ -552,16 +564,16 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) */ li len_2, 16 Load_next_4zetas - ntt_layer12345 0, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 0, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 Load_next_4zetas - ntt_layer12345 128, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 128, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 Load_next_4zetas - ntt_layer12345 256, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 256, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 Load_next_4zetas - ntt_layer12345 384, 32, V_Z0, V_Z1, V_Z2, V_Z3 + ntt_layer12345 384, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 /* * Layer 6. len = 4, @@ -608,13 +620,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. 
* Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef V_QINV +#undef V_ZERO #undef V_NMKQ #undef V_Z0 #undef V_Z1 #undef V_Z2 #undef V_Z3 #undef V_ZETA +#undef V_ZT0 +#undef V_ZT1 +#undef V_ZT2 +#undef V_ZT3 +#undef V_ZETATW #undef vdata_a1 #undef vdata_a2 #undef vdata_a3 @@ -636,6 +653,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) #undef qinp #undef len_2 #undef zeta_inp +#undef zeta_tw_inp #undef a1_offset #undef a2_offset #undef a3_offset diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 1aa1a1236e..c3323d0122 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -680,13 +680,16 @@ /* mlkem/src/native/ppc64le/src/consts.h */ #undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H #undef MLK_PPC_C1353_OFFSET -#undef MLK_PPC_C1441_OFFSET #undef MLK_PPC_C20159_OFFSET #undef MLK_PPC_NQ_OFFSET +#undef MLK_PPC_N_INV_OFFSET +#undef MLK_PPC_N_INV_TW_OFFSET #undef MLK_PPC_QINV_OFFSET #undef MLK_PPC_Q_OFFSET #undef MLK_PPC_ZETA_INTT_OFFSET +#undef MLK_PPC_ZETA_INTT_TW_OFFSET #undef MLK_PPC_ZETA_NTT_OFFSET +#undef MLK_PPC_ZETA_NTT_TW_OFFSET #undef mlk_ppc_qdata #endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index 9fddfc4da1..b0ebd4b96a 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -704,13 +704,16 @@ /* mlkem/src/native/ppc64le/src/consts.h */ #undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H #undef MLK_PPC_C1353_OFFSET -#undef MLK_PPC_C1441_OFFSET #undef MLK_PPC_C20159_OFFSET #undef MLK_PPC_NQ_OFFSET +#undef MLK_PPC_N_INV_OFFSET +#undef MLK_PPC_N_INV_TW_OFFSET #undef MLK_PPC_QINV_OFFSET #undef MLK_PPC_Q_OFFSET #undef MLK_PPC_ZETA_INTT_OFFSET +#undef MLK_PPC_ZETA_INTT_TW_OFFSET #undef MLK_PPC_ZETA_NTT_OFFSET +#undef MLK_PPC_ZETA_NTT_TW_OFFSET #undef mlk_ppc_qdata #endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 
74c8aa441c..4e4a3dde72 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -49,15 +49,28 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { 20159, 20159, 20159, - /* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */ - 1441, - 1441, - 1441, - 1441, - 1441, - 1441, - 1441, - 1441, + /* N^-1 in Montgomery form: pow(128,-1,MLKEM_Q) * 2^16 mod MLKEM_Q = 512. + * Multiplying by this via Barrett-fqmul scales INTT output by N^-1 and + * leaves it in Montgomery form (mlk_poly_invntt_tomont contract). */ + 512, + 512, + 512, + 512, + 512, + 512, + 512, + 512, + /* check-magic: 5040 == round((512 * 2**16 + MLKEM_Q) / MLKEM_Q) // 2 */ + /* Barrett twist of N^-1*R = round_to_even(N_INV_MONT * 2^16 / MLKEM_Q) / 2 + */ + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ 1353, 1353, @@ -71,6 +84,10 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { #include "consts_ntt.inc" /* zetas for invNTT */ #include "consts_intt.inc" +/* twisted zetas for NTT (Barrett high-mul) */ +#include "consts_ntt_tw.inc" +/* twisted zetas for invNTT (Barrett high-mul) */ +#include "consts_intt_tw.inc" }; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index de04ea2191..e72b954cf9 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -13,10 +13,13 @@ #define MLK_PPC_QINV_OFFSET 16 #define MLK_PPC_Q_OFFSET 32 #define MLK_PPC_C20159_OFFSET 48 -#define MLK_PPC_C1441_OFFSET 64 -#define MLK_PPC_C1353_OFFSET 80 -#define MLK_PPC_ZETA_NTT_OFFSET 96 -#define MLK_PPC_ZETA_INTT_OFFSET 1104 +#define MLK_PPC_N_INV_OFFSET 64 +#define MLK_PPC_N_INV_TW_OFFSET 80 +#define MLK_PPC_C1353_OFFSET 96 +#define MLK_PPC_ZETA_NTT_OFFSET 112 +#define MLK_PPC_ZETA_INTT_OFFSET 1120 +#define MLK_PPC_ZETA_NTT_TW_OFFSET 2128 +#define MLK_PPC_ZETA_INTT_TW_OFFSET 3136 /* 
check-magic: on */ #ifndef __ASSEMBLER__ diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc index 0e1dd4367e..057b1df249 100644 --- a/mlkem/src/native/ppc64le/src/consts_intt.inc +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -12,66 +12,66 @@ /* Twiddle factors for the PPC64LE inverse NTT. * See autogen for details. */ - -1460, -1460, 1628, 1628, 958, 958, 1522, 1522, - -308, -308, 991, 991, -108, -108, 996, 996, - -854, -854, 478, 478, -1510, -1510, -870, -870, - -1530, -1530, 794, 794, -1185, -1185, -1278, -1278, - 220, 220, -1659, -1659, -874, -874, -1187, -1187, - -136, -136, -1335, -1335, -1215, -1215, 1218, 1218, - -1285, -1285, 384, 384, 1322, 1322, -1465, -1465, - 1097, 1097, 610, 610, 817, 817, 603, 603, - 329, 329, -75, -75, 418, 418, -156, -156, - 644, 644, 349, 349, -1590, -1590, -872, -872, - 1483, 1483, 1119, 1119, -777, -777, -602, -602, - 778, 778, -147, -147, -246, -246, 1159, 1159, - -460, -460, 1653, 1653, -291, -291, 1574, 1574, - 587, 587, -235, -235, 422, 422, 177, 177, - 871, 871, 105, 105, -1251, -1251, 1550, 1550, - 430, 430, 843, 843, -1103, -1103, 555, 555, - 677, 677, 677, 677, -1275, -1275, -1275, -1275, - 448, 448, 448, 448, -1065, -1065, -1065, -1065, - -1508, -1508, -1508, -1508, -725, -725, -725, -725, - -398, -398, -398, -398, 961, 961, 961, 961, - -247, -247, -247, -247, -951, -951, -951, -951, - 107, 107, 107, 107, -1421, -1421, -1421, -1421, - -271, -271, -271, -271, 830, 830, 830, 830, - -853, -853, -853, -853, -90, -90, -90, -90, - 126, 126, 126, 126, 1469, 1469, 1469, 1469, - -1618, -1618, -1618, -1618, -1162, -1162, -1162, -1162, - -320, -320, -320, -320, -666, -666, -666, -666, - 516, 516, 516, 516, -8, -8, -8, -8, - -282, -282, -282, -282, -1544, -1544, -1544, -1544, - -1293, -1293, -1293, -1293, 1491, 1491, 1491, 1491, - -552, -552, -552, -552, 1015, 1015, 1015, 1015, - 1223, 1223, 1223, 1223, 652, 652, 652, 652, - -1571, -1571, -1571, -1571, -1571, -1571, 
-1571, -1571, - -205, -205, -205, -205, -205, -205, -205, -205, - 411, 411, 411, 411, 411, 411, 411, 411, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - 608, 608, 608, 608, 608, 608, 608, 608, - 732, 732, 732, 732, 732, 732, 732, 732, - 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, - -681, -681, -681, -681, -681, -681, -681, -681, - -130, -130, -130, -130, -130, -130, -130, -130, - -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, - 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, - -829, -829, -829, -829, -829, -829, -829, -829, - 383, 383, 383, 383, 383, 383, 383, 383, - 264, 264, 264, 264, 264, 264, 264, 264, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, - 573, 573, 573, 573, 573, 573, 573, 573, - 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, - -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - 962, 962, 962, 962, 962, 962, 962, 962, - 182, 182, 182, 182, 182, 182, 182, 182, - 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, - 622, 622, 622, 622, 622, 622, 622, 622, - -171, -171, -171, -171, -171, -171, -171, -171, - 202, 202, 202, 202, 202, 202, 202, 202, - 287, 287, 287, 287, 287, 287, 287, 287, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - -359, -359, -359, -359, -359, -359, -359, -359, - -758, -758, -758, -758, -758, -758, -758, -758, + -394, -394, -1175, -1175, -1219, -1219, 885, 885, + 1212, 1212, 1029, 1029, -1607, -1607, -1455, -1455, + -1179, -1179, 886, 886, 1143, 1143, -554, -554, + 1092, 1092, 1026, 1026, -525, -525, 403, 403, + 561, 561, -735, -735, -1230, -1230, -863, -863, + 319, 319, 757, 757, 1063, 1063, -556, -556, + -780, -780, 1645, 1645, 375, 375, -1239, -1239, + -1031, -1031, -109, -109, 1584, 1584, -1292, -1292, + -992, -992, 641, 641, 733, 733, 268, 268, + -1021, -1021, -941, -941, 939, 939, -892, -892, + 952, 952, -642, -642, 
-1482, -1482, 1461, 1461, + 1651, 1651, -1540, -1540, -1626, -1626, -540, -540, + -1173, -1173, -279, -279, 756, 756, -314, -314, + -667, -667, 233, 233, 1409, 1409, -48, -48, + 723, 723, 1100, 1100, 1637, 1637, -1041, -1041, + -568, -568, -680, -680, 17, 17, 583, 583, + 1227, 1227, 1227, 1227, 910, 910, 910, 910, + -855, -855, -855, -855, -219, -219, -219, -219, + 1481, 1481, 1481, 1481, 648, 648, 648, 648, + -682, -682, -682, -682, -712, -712, -712, -712, + 1534, 1534, 1534, 1534, -927, -927, -927, -927, + 1438, 1438, 1438, 1438, -461, -461, -461, -461, + 807, 807, 807, 807, 452, 452, 452, 452, + -1010, -1010, -1010, -1010, 1435, 1435, 1435, 1435, + 1320, 1320, 1320, 1320, -1414, -1414, -1414, -1414, + -464, -464, -464, -464, 33, 33, 33, 33, + -816, -816, -816, -816, 632, 632, 632, 632, + 650, 650, 650, 650, -1352, -1352, -1352, -1352, + -1052, -1052, -1052, -1052, -1274, -1274, -1274, -1274, + 1197, 1197, 1197, 1197, -1025, -1025, -1025, -1025, + -76, -76, -76, -76, -1573, -1573, -1573, -1573, + 289, 289, 289, 289, 331, 331, 331, 331, + 821, 821, 821, 821, 821, 821, 821, 821, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + -450, -450, -450, -450, -450, -450, -450, -450, + -936, -936, -936, -936, -936, -936, -936, -936, + -447, -447, -447, -447, -447, -447, -447, -447, + 535, 535, 535, 535, 535, 535, 535, 535, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 56, 56, 56, 56, 56, 56, 56, 56, + -283, -283, -283, -283, -283, -283, -283, -283, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + -882, -882, -882, -882, -882, -882, -882, -882, + 296, 296, 296, 296, 296, 296, 296, 296, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 569, 569, 569, 569, 569, 569, 569, 569, + -69, -69, -69, -69, -69, -69, -69, -69, + -543, -543, -543, 
-543, -543, -543, -543, -543, + 797, 797, 797, 797, 797, 797, 797, 797, + 193, 193, 193, 193, 193, 193, 193, 193, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + 848, 848, 848, 848, 848, 848, 848, 848, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 630, 630, 630, 630, 630, 630, 630, 630, + -687, -687, -687, -687, -687, -687, -687, -687, + -40, -40, -40, -40, -40, -40, -40, -40, + -749, -749, -749, -749, -749, -749, -749, -749, + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, diff --git a/mlkem/src/native/ppc64le/src/consts_intt_tw.inc b/mlkem/src/native/ppc64le/src/consts_intt_tw.inc new file mode 100644 index 0000000000..783e0af8f2 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. 
+ */ + -3878, -3878, -11566, -11566, -11999, -11999, 8711, 8711, + 11930, 11930, 10129, 10129, -15818, -15818, -14322, -14322, + -11605, -11605, 8721, 8721, 11251, 11251, -5453, -5453, + 10749, 10749, 10099, 10099, -5168, -5168, 3967, 3967, + 5522, 5522, -7235, -7235, -12107, -12107, -8495, -8495, + 3140, 3140, 7451, 7451, 10463, 10463, -5473, -5473, + -7678, -7678, 16192, 16192, 3691, 3691, -12196, -12196, + -10148, -10148, -1073, -1073, 15592, 15592, -12717, -12717, + -9764, -9764, 6309, 6309, 7215, 7215, 2638, 2638, + -10050, -10050, -9262, -9262, 9243, 9243, -8780, -8780, + 9371, 9371, -6319, -6319, -14588, -14588, 14381, 14381, + 16251, 16251, -15159, -15159, -16005, -16005, -5315, -5315, + -11546, -11546, -2746, -2746, 7441, 7441, -3091, -3091, + -6565, -6565, 2293, 2293, 13869, 13869, -472, -472, + 7117, 7117, 10828, 10828, 16113, 16113, -10247, -10247, + -5591, -5591, -6693, -6693, 167, 167, 5739, 5739, + 12078, 12078, 12078, 12078, 8957, 8957, 8957, 8957, + -8416, -8416, -8416, -8416, -2156, -2156, -2156, -2156, + 14578, 14578, 14578, 14578, 6378, 6378, 6378, 6378, + -6713, -6713, -6713, -6713, -7008, -7008, -7008, -7008, + 15099, 15099, 15099, 15099, -9125, -9125, -9125, -9125, + 14155, 14155, 14155, 14155, -4538, -4538, -4538, -4538, + 7943, 7943, 7943, 7943, 4449, 4449, 4449, 4449, + -9942, -9942, -9942, -9942, 14125, 14125, 14125, 14125, + 12993, 12993, 12993, 12993, -13918, -13918, -13918, -13918, + -4567, -4567, -4567, -4567, 325, 325, 325, 325, + -8032, -8032, -8032, -8032, 6221, 6221, 6221, 6221, + 6398, 6398, 6398, 6398, -13308, -13308, -13308, -13308, + -10355, -10355, -10355, -10355, -12540, -12540, -12540, -12540, + 11782, 11782, 11782, 11782, -10089, -10089, -10089, -10089, + -748, -748, -748, -748, -15483, -15483, -15483, -15483, + 2845, 2845, 2845, 2845, 3258, 3258, 3258, 3258, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, 
-4429, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 551, 551, 551, 551, 551, 551, 551, 551, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -679, -679, -679, -679, -679, -679, -679, -679, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + -394, -394, -394, -394, -394, -394, -394, -394, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc index 2729155aab..e53bf13713 100644 --- a/mlkem/src/native/ppc64le/src/consts_ntt.inc +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -12,66 +12,66 @@ /* Twiddle factors for the PPC64LE forward NTT. * See autogen for details. 
*/ - -758, -758, -758, -758, -758, -758, -758, -758, - -359, -359, -359, -359, -359, -359, -359, -359, - -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, - 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, - 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, - 287, 287, 287, 287, 287, 287, 287, 287, - 202, 202, 202, 202, 202, 202, 202, 202, - -171, -171, -171, -171, -171, -171, -171, -171, - 622, 622, 622, 622, 622, 622, 622, 622, - 1577, 1577, 1577, 1577, 1577, 1577, 1577, 1577, - 182, 182, 182, 182, 182, 182, 182, 182, - 962, 962, 962, 962, 962, 962, 962, 962, - -1202, -1202, -1202, -1202, -1202, -1202, -1202, -1202, - -1474, -1474, -1474, -1474, -1474, -1474, -1474, -1474, - 1468, 1468, 1468, 1468, 1468, 1468, 1468, 1468, - 573, 573, 573, 573, 573, 573, 573, 573, - -1325, -1325, -1325, -1325, -1325, -1325, -1325, -1325, - 264, 264, 264, 264, 264, 264, 264, 264, - 383, 383, 383, 383, 383, 383, 383, 383, - -829, -829, -829, -829, -829, -829, -829, -829, - 1458, 1458, 1458, 1458, 1458, 1458, 1458, 1458, - -1602, -1602, -1602, -1602, -1602, -1602, -1602, -1602, - -130, -130, -130, -130, -130, -130, -130, -130, - -681, -681, -681, -681, -681, -681, -681, -681, - 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, - 732, 732, 732, 732, 732, 732, 732, 732, - 608, 608, 608, 608, 608, 608, 608, 608, - -1542, -1542, -1542, -1542, -1542, -1542, -1542, -1542, - 411, 411, 411, 411, 411, 411, 411, 411, - -205, -205, -205, -205, -205, -205, -205, -205, - -1571, -1571, -1571, -1571, -1571, -1571, -1571, -1571, - 652, 652, 652, 652, 1223, 1223, 1223, 1223, - 1015, 1015, 1015, 1015, -552, -552, -552, -552, - 1491, 1491, 1491, 1491, -1293, -1293, -1293, -1293, - -1544, -1544, -1544, -1544, -282, -282, -282, -282, - -8, -8, -8, -8, 516, 516, 516, 516, - -666, -666, -666, -666, -320, -320, -320, -320, - -1162, -1162, -1162, -1162, -1618, -1618, -1618, -1618, - 1469, 1469, 1469, 1469, 126, 126, 126, 126, - -90, -90, -90, -90, -853, -853, -853, -853, - 830, 830, 830, 830, -271, -271, 
-271, -271, - -1421, -1421, -1421, -1421, 107, 107, 107, 107, - -951, -951, -951, -951, -247, -247, -247, -247, - 961, 961, 961, 961, -398, -398, -398, -398, - -725, -725, -725, -725, -1508, -1508, -1508, -1508, - -1065, -1065, -1065, -1065, 448, 448, 448, 448, - -1275, -1275, -1275, -1275, 677, 677, 677, 677, - 555, 555, -1103, -1103, 843, 843, 430, 430, - 1550, 1550, -1251, -1251, 105, 105, 871, 871, - 177, 177, 422, 422, -235, -235, 587, 587, - 1574, 1574, -291, -291, 1653, 1653, -460, -460, - 1159, 1159, -246, -246, -147, -147, 778, 778, - -602, -602, -777, -777, 1119, 1119, 1483, 1483, - -872, -872, -1590, -1590, 349, 349, 644, 644, - -156, -156, 418, 418, -75, -75, 329, 329, - 603, 603, 817, 817, 610, 610, 1097, 1097, - -1465, -1465, 1322, 1322, 384, 384, -1285, -1285, - 1218, 1218, -1215, -1215, -1335, -1335, -136, -136, - -1187, -1187, -874, -874, -1659, -1659, 220, 220, - -1278, -1278, -1185, -1185, 794, 794, -1530, -1530, - -870, -870, -1510, -1510, 478, 478, -854, -854, - 996, 996, -108, -108, 991, 991, -308, -308, - 1522, 1522, 958, 958, 1628, 1628, -1460, -1460, + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, + -749, -749, -749, -749, -749, -749, -749, -749, + -40, -40, -40, -40, -40, -40, -40, -40, + -687, -687, -687, -687, -687, -687, -687, -687, + 630, 630, 630, 630, 630, 630, 630, 630, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 848, 848, 848, 848, 848, 848, 848, 848, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 193, 193, 193, 193, 193, 193, 193, 193, + 797, 797, 797, 797, 797, 797, 797, 797, + -543, -543, -543, -543, -543, -543, -543, -543, + -69, -69, -69, -69, -69, -69, -69, -69, + 569, 569, 569, 569, 569, 569, 569, 569, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 296, 296, 296, 296, 296, 296, 296, 296, + -882, -882, -882, -882, -882, -882, -882, -882, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + 1476, 1476, 1476, 1476, 1476, 1476, 
1476, 1476, + -283, -283, -283, -283, -283, -283, -283, -283, + 56, 56, 56, 56, 56, 56, 56, 56, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 535, 535, 535, 535, 535, 535, 535, 535, + -447, -447, -447, -447, -447, -447, -447, -447, + -936, -936, -936, -936, -936, -936, -936, -936, + -450, -450, -450, -450, -450, -450, -450, -450, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + 821, 821, 821, 821, 821, 821, 821, 821, + 331, 331, 331, 331, 289, 289, 289, 289, + -1573, -1573, -1573, -1573, -76, -76, -76, -76, + -1025, -1025, -1025, -1025, 1197, 1197, 1197, 1197, + -1274, -1274, -1274, -1274, -1052, -1052, -1052, -1052, + -1352, -1352, -1352, -1352, 650, 650, 650, 650, + 632, 632, 632, 632, -816, -816, -816, -816, + 33, 33, 33, 33, -464, -464, -464, -464, + -1414, -1414, -1414, -1414, 1320, 1320, 1320, 1320, + 1435, 1435, 1435, 1435, -1010, -1010, -1010, -1010, + 452, 452, 452, 452, 807, 807, 807, 807, + -461, -461, -461, -461, 1438, 1438, 1438, 1438, + -927, -927, -927, -927, 1534, 1534, 1534, 1534, + -712, -712, -712, -712, -682, -682, -682, -682, + 648, 648, 648, 648, 1481, 1481, 1481, 1481, + -219, -219, -219, -219, -855, -855, -855, -855, + 910, 910, 910, 910, 1227, 1227, 1227, 1227, + 583, 583, 17, 17, -680, -680, -568, -568, + -1041, -1041, 1637, 1637, 1100, 1100, 723, 723, + -48, -48, 1409, 1409, 233, 233, -667, -667, + -314, -314, 756, 756, -279, -279, -1173, -1173, + -540, -540, -1626, -1626, -1540, -1540, 1651, 1651, + 1461, 1461, -1482, -1482, -642, -642, 952, 952, + -892, -892, 939, 939, -941, -941, -1021, -1021, + 268, 268, 733, 733, 641, 641, -992, -992, + -1292, -1292, 1584, 1584, -109, -109, -1031, -1031, + -1239, -1239, 375, 375, 1645, 1645, -780, -780, + -556, -556, 1063, 1063, 757, 757, 319, 319, + -863, -863, -1230, -1230, -735, -735, 561, 561, + 403, 403, -525, -525, 
1026, 1026, 1092, 1092, + -554, -554, 1143, 1143, 886, 886, -1179, -1179, + -1455, -1455, -1607, -1607, 1029, 1029, 1212, 1212, + 885, 885, -1219, -1219, -1175, -1175, -394, -394, diff --git a/mlkem/src/native/ppc64le/src/consts_ntt_tw.inc b/mlkem/src/native/ppc64le/src/consts_ntt_tw.inc new file mode 100644 index 0000000000..a191b3bf2f --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -394, -394, -394, -394, -394, -394, -394, -394, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + -679, -679, -679, -679, -679, -679, -679, -679, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 551, 551, 551, 551, 551, 551, 551, 551, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, 
-10719, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, -4429, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + 3258, 3258, 3258, 3258, 2845, 2845, 2845, 2845, + -15483, -15483, -15483, -15483, -748, -748, -748, -748, + -10089, -10089, -10089, -10089, 11782, 11782, 11782, 11782, + -12540, -12540, -12540, -12540, -10355, -10355, -10355, -10355, + -13308, -13308, -13308, -13308, 6398, 6398, 6398, 6398, + 6221, 6221, 6221, 6221, -8032, -8032, -8032, -8032, + 325, 325, 325, 325, -4567, -4567, -4567, -4567, + -13918, -13918, -13918, -13918, 12993, 12993, 12993, 12993, + 14125, 14125, 14125, 14125, -9942, -9942, -9942, -9942, + 4449, 4449, 4449, 4449, 7943, 7943, 7943, 7943, + -4538, -4538, -4538, -4538, 14155, 14155, 14155, 14155, + -9125, -9125, -9125, -9125, 15099, 15099, 15099, 15099, + -7008, -7008, -7008, -7008, -6713, -6713, -6713, -6713, + 6378, 6378, 6378, 6378, 14578, 14578, 14578, 14578, + -2156, -2156, -2156, -2156, -8416, -8416, -8416, -8416, + 8957, 8957, 8957, 8957, 12078, 12078, 12078, 12078, + 5739, 5739, 167, 167, -6693, -6693, -5591, -5591, + -10247, -10247, 16113, 16113, 10828, 10828, 7117, 7117, + -472, -472, 13869, 13869, 2293, 2293, -6565, -6565, + -3091, -3091, 7441, 7441, -2746, -2746, -11546, -11546, + -5315, -5315, -16005, -16005, -15159, -15159, 16251, 16251, + 14381, 14381, -14588, -14588, -6319, -6319, 9371, 9371, + -8780, -8780, 9243, 9243, -9262, -9262, -10050, -10050, + 2638, 2638, 7215, 7215, 6309, 6309, -9764, -9764, + -12717, -12717, 15592, 15592, -1073, -1073, -10148, -10148, + -12196, -12196, 3691, 3691, 16192, 16192, 
-7678, -7678, + -5473, -5473, 10463, 10463, 7451, 7451, 3140, 3140, + -8495, -8495, -12107, -12107, -7235, -7235, 5522, 5522, + 3967, 3967, -5168, -5168, 10099, 10099, 10749, 10749, + -5453, -5453, 11251, 11251, 8721, 8721, -11605, -11605, + -14322, -14322, -15818, -15818, 10129, 10129, 11930, 11930, + 8711, 8711, -11999, -11999, -11566, -11566, -3878, -3878, diff --git a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S index ddd6d97b87..958cfdb354 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S @@ -32,6 +32,7 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) std 19, 96(1) std 20, 104(1) std 21, 112(1) + std 22, 120(1) li 10, 128 li 11, 144 li 12, 160 @@ -57,13 +58,8 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) stxvx 62, 15, 1 stxvx 63, 16, 1 lxvx 0, 0, 4 - li 10, 16 - lxvx 34, 10, 4 xxlxor 35, 35, 35 - vspltish 4, 1 - xxlor 2, 34, 34 xxlor 3, 35, 35 - xxlor 4, 36, 36 li 10, 32 li 11, 48 lxvx 6, 10, 4 @@ -84,12 +80,11 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) li 18, 112 addi 14, 4, 64 lvx 10, 0, 14 + addi 14, 4, 80 + lvx 11, 0, 14 li 8, 4 mtctr 8 xxlor 37, 0, 0 - xxlor 34, 2, 2 - xxlor 35, 3, 3 - xxlor 36, 4, 4 intt_ppc_asm_Loopf: lxvd2x 57, 0, 3 @@ -97,51 +92,35 @@ intt_ppc_asm_Loopf: lxvd2x 62, 11, 3 lxvd2x 63, 12, 3 addi 3, 3, 64 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 - vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 6, 15, 4 - vsrah 7, 20, 4 - vsrah 8, 25, 4 - vsrah 9, 30, 4 + vmhraddshs 14, 25, 11, 3 + vmhraddshs 19, 26, 11, 3 + vmhraddshs 24, 30, 11, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 6, 25, 10, 3 + vmladduhm 7, 26, 10, 3 + vmladduhm 8, 30, 10, 3 + 
vmladduhm 9, 31, 10, 3 + vmladduhm 6, 14, 5, 6 + vmladduhm 7, 19, 5, 7 + vmladduhm 8, 24, 5, 8 + vmladduhm 9, 29, 5, 9 lxvd2x 57, 0, 3 lxvd2x 58, 10, 3 lxvd2x 62, 11, 3 lxvd2x 63, 12, 3 addi 3, 3, 64 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + vmhraddshs 14, 25, 11, 3 + vmhraddshs 19, 26, 11, 3 + vmhraddshs 24, 30, 11, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 addi 3, 3, -128 stxvd2x 38, 0, 3 stxvd2x 39, 10, 3 @@ -156,7 +135,9 @@ intt_ppc_asm_Loopf: addi 3, 3, -512 nop nop - addi 14, 4, 1104 + nop + addi 14, 4, 1120 + addi 22, 4, 3136 li 7, 4 mr 5, 3 lxvd2x 57, 0, 5 @@ -236,9 +217,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -246,27 +225,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 
- vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -365,9 +341,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -375,27 +349,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -494,9 +465,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -504,27 +473,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 
27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -623,9 +589,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -633,27 +597,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -675,7 +636,6 @@ 
intt_ppc_asm_Loopf: stxvd2x 48, 17, 5 stxvd2x 49, 18, 5 addi 5, 5, 128 - nop mr 5, 3 li 7, 8 lxvd2x 10, 0, 5 @@ -755,9 +715,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -765,27 +723,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -884,9 +839,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -894,27 +847,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 
19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -1013,9 +963,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1023,27 +971,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -1142,9 +1087,7 @@ intt_ppc_asm_Loopf: xxlor 12, 45, 45 xxlor 13, 49, 49 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1152,27 +1095,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 
22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 xxlor 46, 10, 10 xxlor 51, 11, 11 xxlor 56, 12, 12 @@ -1274,9 +1214,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1284,27 +1222,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + 
vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -1386,9 +1321,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1396,27 +1329,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -1498,9 +1428,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1508,27 +1436,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - 
vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -1610,9 +1535,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1620,27 +1543,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -1726,9 +1646,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1736,32 +1654,30 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 
0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, -64 + addi 22, 22, -64 li 9, 16 add 10, 7, 9 addi 16, 9, 64 @@ -1839,9 +1755,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1849,27 +1763,24 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 
15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -1951,9 +1862,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -1961,32 +1870,30 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, -64 + addi 22, 22, -64 li 9, 272 add 10, 7, 9 addi 16, 9, 64 @@ -2064,9 +1971,7 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 li 8, 16 li 11, 32 li 12, 48 @@ -2074,32 +1979,31 @@ intt_ppc_asm_Loopf: lxvd2x 40, 8, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 - vmladduhm 15, 25, 7, 3 - vmladduhm 20, 26, 8, 3 - vmladduhm 27, 30, 9, 3 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + 
vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 7, 3 - vmhraddshs 19, 26, 8, 3 - vmhraddshs 24, 30, 9, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 nop + nop + nop li 7, 64 li 9, 0 add 10, 7, 9 @@ -2178,35 +2082,27 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, 16 + addi 22, 22, 16 li 9, 128 add 10, 7, 9 addi 16, 9, 16 @@ -2284,35 +2180,27 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 
35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, 16 + addi 22, 22, 16 li 9, 256 add 10, 7, 9 addi 16, 9, 16 @@ -2390,35 +2278,27 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 
3, 21 addi 14, 14, 16 + addi 22, 22, 16 li 9, 384 add 10, 7, 9 addi 16, 9, 16 @@ -2496,35 +2376,27 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, 16 + addi 22, 22, 16 nop nop nop @@ -2606,30 +2478,21 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 
+ vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -2711,35 +2574,27 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, 16 + addi 22, 22, 16 li 9, 256 add 10, 7, 9 addi 16, 9, 16 @@ -2817,30 +2672,21 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 
25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -2922,35 +2768,29 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 stxvd2x 60, 3, 21 addi 14, 14, 16 + addi 22, 22, 16 + nop + nop nop li 7, 256 li 9, 0 @@ -3030,30 +2870,21 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 
27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -3135,30 +2966,21 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -3240,30 +3062,21 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 
- vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -3345,30 +3158,21 @@ intt_ppc_asm_Loopf: stxvd2x 45, 3, 18 stxvd2x 49, 3, 20 xxlor 37, 0, 0 - xxlor 34, 2, 2 xxlor 35, 3, 3 - xxlor 36, 4, 4 lvx 10, 0, 14 - vmladduhm 15, 25, 10, 3 - vmladduhm 20, 26, 10, 3 - vmladduhm 27, 30, 10, 3 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 vmladduhm 28, 31, 10, 3 - vmhraddshs 14, 25, 10, 3 - vmhraddshs 19, 26, 10, 3 - vmhraddshs 24, 30, 10, 3 - vmhraddshs 29, 31, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 27, 2, 3 - vmladduhm 30, 28, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 stxvd2x 45, 3, 10 stxvd2x 50, 3, 17 stxvd2x 55, 3, 19 @@ -3405,6 +3209,7 @@ intt_ppc_asm_Loopf: ld 19, 96(1) ld 20, 104(1) ld 21, 112(1) + ld 22, 120(1) mtlr 0 addi 1, 1, 352 blr diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S index 31c4595357..bc3a86db8a 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S @@ -32,6 +32,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) std 19, 96(1) std 20, 104(1) std 21, 112(1) + std 22, 120(1) li 10, 128 li 11, 144 li 12, 160 @@ -57,14 +58,15 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvx 62, 15, 1 stxvx 63, 16, 1 lvx 5, 0, 4 - 
addi 14, 4, 96 + addi 14, 4, 112 + addi 22, 4, 2128 vxor 3, 3, 3 - vspltish 4, 1 - li 10, 16 - lvx 2, 10, 4 + nop li 7, 256 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 0 add 10, 7, 9 addi 16, 9, 16 @@ -77,26 +79,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -129,26 +123,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 
15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -181,26 +167,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -233,26 +211,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 
5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -274,9 +244,13 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 nop + nop + nop li 7, 128 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 0 add 10, 7, 9 addi 16, 9, 16 @@ -289,26 +263,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -341,26 +307,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 
10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -382,7 +340,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 256 add 10, 7, 9 addi 16, 9, 16 @@ -395,26 +355,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -447,26 +399,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 
23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -492,7 +436,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) nop li 7, 64 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 0 add 10, 7, 9 addi 16, 9, 16 @@ -505,26 +451,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -546,7 +484,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 128 add 10, 7, 9 addi 16, 9, 16 @@ -559,26 +499,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - 
vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -600,7 +532,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 256 add 10, 7, 9 addi 16, 9, 16 @@ -613,26 +547,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -654,7 +580,9 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 lvx 10, 0, 14 + lvx 2, 0, 22 addi 14, 14, 16 + addi 22, 22, 16 li 9, 384 add 10, 7, 9 addi 16, 9, 16 @@ -667,26 +595,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 
21 - vmladduhm 15, 13, 10, 3 - vmladduhm 20, 18, 10, 3 - vmladduhm 25, 23, 10, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 10, 3 - vmhraddshs 19, 18, 10, 3 - vmhraddshs 24, 23, 10, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -718,7 +638,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 li 9, 0 add 10, 7, 9 addi 16, 9, 64 @@ -731,26 +656,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 
28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -783,26 +700,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -830,7 +739,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 li 9, 256 add 10, 7, 9 addi 16, 9, 64 @@ -843,26 +757,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + 
vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -895,26 +801,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -936,8 +834,6 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 nop - nop - nop li 7, 16 li 10, 16 li 11, 32 @@ -946,7 +842,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 li 9, 0 add 10, 7, 9 addi 16, 9, 32 @@ -959,26 +860,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 
14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -1006,7 +899,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 li 9, 128 add 10, 7, 9 addi 16, 9, 32 @@ -1019,26 +917,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -1066,7 +956,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 li 9, 256 add 10, 7, 9 addi 16, 9, 32 @@ -1079,26 
+974,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -1126,7 +1013,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 li 9, 384 add 10, 7, 9 addi 16, 9, 32 @@ -1139,26 +1031,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 50, 3, 17 lxvd2x 55, 3, 19 lxvd2x 60, 3, 21 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + 
vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 lxvd2x 44, 3, 9 lxvd2x 49, 3, 16 lxvd2x 54, 3, 18 @@ -1197,7 +1081,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 xxmrgld 45, 2, 1 @@ -1214,26 +1103,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 4, 18, 5 xxmrgld 60, 4, 3 xxmrghd 59, 4, 3 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1266,7 +1147,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 xxmrgld 45, 2, 1 @@ -1283,26 +1169,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 4, 18, 5 xxmrgld 60, 4, 3 xxmrghd 59, 4, 3 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 
20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1335,7 +1213,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 xxmrgld 45, 2, 1 @@ -1352,26 +1235,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 4, 18, 5 xxmrgld 60, 4, 3 xxmrghd 59, 4, 3 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1404,7 +1279,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 
+ lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 1, 0, 5 lxvd2x 2, 10, 5 xxmrgld 45, 2, 1 @@ -1421,26 +1301,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 4, 18, 5 xxmrgld 60, 4, 3 xxmrghd 59, 4, 3 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1477,7 +1349,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 57, 0, 5 lxvd2x 58, 10, 5 vmrgew 13, 25, 26 @@ -1494,26 +1371,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 58, 18, 5 vmrgew 28, 25, 26 vmrgow 27, 25, 26 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 
6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1546,7 +1415,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 57, 0, 5 lxvd2x 58, 10, 5 vmrgew 13, 25, 26 @@ -1563,26 +1437,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 58, 18, 5 vmrgew 28, 25, 26 vmrgow 27, 25, 26 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1615,7 +1481,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 57, 0, 5 lxvd2x 58, 10, 5 vmrgew 13, 25, 26 @@ -1632,26 +1503,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 58, 18, 5 vmrgew 28, 25, 26 vmrgow 27, 25, 26 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 
28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ -1684,7 +1547,12 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 40, 10, 14 lxvd2x 41, 11, 14 lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 addi 14, 14, 64 + addi 22, 22, 64 lxvd2x 57, 0, 5 lxvd2x 58, 10, 5 vmrgew 13, 25, 26 @@ -1701,26 +1569,18 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) lxvd2x 58, 18, 5 vmrgew 28, 25, 26 vmrgow 27, 25, 26 - vmladduhm 15, 13, 7, 3 - vmladduhm 20, 18, 8, 3 - vmladduhm 25, 23, 9, 3 - vmladduhm 30, 28, 10, 3 - vmhraddshs 14, 13, 7, 3 - vmhraddshs 19, 18, 8, 3 - vmhraddshs 24, 23, 9, 3 - vmhraddshs 29, 28, 10, 3 - vmladduhm 15, 15, 2, 3 - vmladduhm 20, 20, 2, 3 - vmladduhm 25, 25, 2, 3 - vmladduhm 30, 30, 2, 3 - vmhraddshs 15, 15, 5, 14 - vmhraddshs 20, 20, 5, 19 - vmhraddshs 25, 25, 5, 24 - vmhraddshs 30, 30, 5, 29 - vsrah 13, 15, 4 - vsrah 18, 20, 4 - vsrah 23, 25, 4 - vsrah 28, 30, 4 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 vsubuhm 16, 12, 13 vadduhm 15, 13, 12 vsubuhm 21, 17, 18 @@ 
-1778,6 +1638,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) ld 19, 96(1) ld 20, 104(1) ld 21, 112(1) + ld 22, 120(1) mtlr 0 addi 1, 1, 352 blr diff --git a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S index 6739a61505..e4157fd1d4 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S @@ -56,7 +56,7 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) stxvx 62, 9, 1 li 6, 0 li 7, 16 - li 8, 80 + li 8, 96 lxvx 37, 6, 4 lxvx 34, 7, 4 lxvx 32, 8, 4 diff --git a/scripts/autogen b/scripts/autogen index ff9e1f3f82..7b77f88e1a 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -2054,8 +2054,15 @@ def gen_riscv64_zeta_files(): _PPC64LE_LEN2_PERM = [2, 0, 3, 1] -def gen_ppc64le_ntt_zetas(): - z = list(gen_c_zetas()) +def gen_c_real_zetas(): + # Same bit-reversed ordering as `gen_c_zetas`, but without the + # Montgomery factor (raw `z = root^idx mod q`). + zeta = [signed_reduce(pow(root_of_unity, i, modulus)) for i in range(128)] + yield from (zeta[bitreverse(i, 7)] for i in range(128)) + + +def _gen_ppc64le_ntt_zeta_layout(transform): + z = [transform(t) for t in gen_c_real_zetas()] # Layers Len=128, 64, 32, 16, 8: broadcast x 8. for i in range(1, 32): @@ -2072,8 +2079,8 @@ def gen_ppc64le_ntt_zetas(): yield from [src[p]] * 2 -def gen_ppc64le_intt_zetas(): - z = list(gen_c_zetas()) +def _gen_ppc64le_intt_zeta_layout(transform): + z = [transform(t) for t in gen_c_real_zetas()] # Layer Len=2: per group of 4 reverse-ordered zetas, same permutation # and duplication as the NTT. 
@@ -2092,6 +2099,30 @@ def gen_ppc64le_intt_zetas(): yield from [z[31 - i]] * 8 +def _ppc64le_zeta_id(z): + return signed_reduce(z) + + +def _ppc64le_zeta_twist(z): + return prepare_root_for_barrett(z)[1] + + +def gen_ppc64le_ntt_zetas(): + yield from _gen_ppc64le_ntt_zeta_layout(_ppc64le_zeta_id) + + +def gen_ppc64le_intt_zetas(): + yield from _gen_ppc64le_intt_zeta_layout(_ppc64le_zeta_id) + + +def gen_ppc64le_ntt_twist_zetas(): + yield from _gen_ppc64le_ntt_zeta_layout(_ppc64le_zeta_twist) + + +def gen_ppc64le_intt_twist_zetas(): + yield from _gen_ppc64le_intt_zeta_layout(_ppc64le_zeta_twist) + + def gen_ppc64le_zeta_files(): """Generate PPC64LE zeta include files.""" @@ -2116,20 +2147,30 @@ def gen_ppc64le_zeta_files(): "/* Twiddle factors for the PPC64LE inverse NTT.\n * See autogen for details.\n */", ) ) + ntt_tw_content = "\n".join( + gen_inc( + gen_ppc64le_ntt_twist_zetas(), + "/* Twisted twiddle factors for the PPC64LE forward NTT.\n * See autogen for details.\n */", + ) + ) + intt_tw_content = "\n".join( + gen_inc( + gen_ppc64le_intt_twist_zetas(), + "/* Twisted twiddle factors for the PPC64LE inverse NTT.\n * See autogen for details.\n */", + ) + ) # The .inc files are #include'd by `consts.c` (not by an .S file), so they # are not inlined via simpasm; we therefore write them directly into both # the developer tree and the mlkem mirror. 
- for path in ( - "dev/ppc64le/src/consts_ntt.inc", - "mlkem/src/native/ppc64le/src/consts_ntt.inc", - ): - update_file(path, ntt_content) - for path in ( - "dev/ppc64le/src/consts_intt.inc", - "mlkem/src/native/ppc64le/src/consts_intt.inc", + for filename, content in ( + ("consts_ntt.inc", ntt_content), + ("consts_intt.inc", intt_content), + ("consts_ntt_tw.inc", ntt_tw_content), + ("consts_intt_tw.inc", intt_tw_content), ): - update_file(path, intt_content) + for tree in ("dev/ppc64le/src", "mlkem/src/native/ppc64le/src"): + update_file(f"{tree}/{filename}", content) def get_c_source_files(main_only=False, core_only=False, strip_mlkem=False): From 6899543a2fb43fe9c83331fc0161fb8436d62984 Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Wed, 20 May 2026 10:04:14 +0200 Subject: [PATCH 23/27] ppc64le: address review comments from mkannwischer Signed-off-by: Basil Hess --- .github/actions/multi-functest/action.yml | 31 +- .github/workflows/ci.yml | 8 +- BIBLIOGRAPHY.md | 1 - dev/ppc64le/src/consts.c | 4 +- dev/ppc64le/src/intt_ppc_asm.S | 4 +- dev/ppc64le/src/ntt_ppc_asm.S | 8 +- dev/ppc64le/src/poly_tomont_ppc_asm.S | 5 +- dev/ppc64le/src/reduce_ppc_asm.S | 5 +- flake.nix | 2 +- integration/liboqs/ML-KEM-1024_META.yml | 19 -- integration/liboqs/ML-KEM-512_META.yml | 19 -- integration/liboqs/ML-KEM-768_META.yml | 19 -- integration/liboqs/config_ppc64le.h | 267 ------------------ mlkem/src/native/ppc64le/src/consts.c | 4 +- mlkem/src/native/ppc64le/src/intt_ppc_asm.S | 70 +++-- mlkem/src/native/ppc64le/src/ntt_ppc_asm.S | 66 +++-- .../native/ppc64le/src/poly_tomont_ppc_asm.S | 55 ++-- mlkem/src/native/ppc64le/src/reduce_ppc_asm.S | 29 +- test/mk/components.mk | 3 +- 19 files changed, 149 insertions(+), 470 deletions(-) delete mode 100644 integration/liboqs/config_ppc64le.h diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index 3e3837a7f6..dc72b25f25 100644 --- a/.github/actions/multi-functest/action.yml +++ 
b/.github/actions/multi-functest/action.yml @@ -147,7 +147,7 @@ runs: rng_fail: ${{ inputs.rng_fail }} extra_args: ${{ inputs.extra_args }} extra_env: ${{ inputs.extra_env }} - - name: Cross ppc64le Tests + - name: Cross ppc64le Tests (POWER8) if: ${{ (inputs.compile_mode == 'all' || inputs.compile_mode == 'cross-ppc64le') && (success() || failure()) }} uses: ./.github/actions/functest with: @@ -156,10 +156,35 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE" + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mcpu=power8" ldflags: ${{ inputs.ldflags }} cross_prefix: powerpc64le-unknown-linux-gnu- - exec_wrapper: qemu-ppc64le + exec_wrapper: "qemu-ppc64le -cpu power8" + opt: ${{ inputs.opt }} + func: ${{ inputs.func }} + kat: ${{ inputs.kat }} + unit: ${{ inputs.unit }} + acvp: ${{ inputs.acvp }} + wycheproof: ${{ inputs.wycheproof }} + examples: ${{ inputs.examples }} + check_namespace: ${{ inputs.check_namespace }} + stack: ${{ inputs.stack }} + alloc: ${{ inputs.alloc }} + rng_fail: ${{ inputs.rng_fail }} + extra_args: ${{ inputs.extra_args }} + - name: Cross ppc64le Tests (POWER7) + if: ${{ (inputs.compile_mode == 'all' || inputs.compile_mode == 'cross-ppc64le') && (success() || failure()) }} + uses: ./.github/actions/functest + with: + nix-shell: ${{ inputs.nix-shell }} + nix-cache: ${{ inputs.nix-cache }} + nix-verbose: ${{ inputs.nix-verbose }} + gh_token: ${{ inputs.gh_token }} + custom_shell: ${{ inputs.custom_shell }} + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mcpu=power7" + ldflags: ${{ inputs.ldflags }} + cross_prefix: powerpc64le-unknown-linux-gnu- + exec_wrapper: "qemu-ppc64le -cpu power7" opt: ${{ inputs.opt }} func: ${{ inputs.func }} kat: ${{ inputs.kat }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 28732278f5..13555e5ae2 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -154,8 +154,8 @@ jobs: check_namespace: 'false' - name: build + test (cross, opt) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on riscv32 or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} @@ -164,8 +164,8 @@ jobs: opt: 'opt' - name: build + test (cross, opt, +debug) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on riscv32 or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 3f2751394c..231c5e5d26 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -72,7 +72,6 @@ source code and documentation. 
- [examples/multilevel_build_native/mlkem_native/mlkem_native_config.h](examples/multilevel_build_native/mlkem_native/mlkem_native_config.h) - [integration/liboqs/config_aarch64.h](integration/liboqs/config_aarch64.h) - [integration/liboqs/config_c.h](integration/liboqs/config_c.h) - - [integration/liboqs/config_ppc64le.h](integration/liboqs/config_ppc64le.h) - [integration/liboqs/config_x86_64.h](integration/liboqs/config_x86_64.h) - [mlkem/mlkem_native_config.h](mlkem/mlkem_native_config.h) - [mlkem/src/kem.c](mlkem/src/kem.c) diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 4e4a3dde72..6f3f71eff4 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -6,7 +6,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) #include "consts.h" @@ -90,4 +90,4 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { #include "consts_intt_tw.inc" }; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/intt_ppc_asm.S b/dev/ppc64le/src/intt_ppc_asm.S index bd92cdad50..f0eaa12de5 100644 --- a/dev/ppc64le/src/intt_ppc_asm.S +++ b/dev/ppc64le/src/intt_ppc_asm.S @@ -1,10 +1,8 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 2025, 2026 - * - * =================================================================================== * Written by Danny Tsen */ diff --git a/dev/ppc64le/src/ntt_ppc_asm.S b/dev/ppc64le/src/ntt_ppc_asm.S index 33bccdb9cb..c28783881c 100644 --- a/dev/ppc64le/src/ntt_ppc_asm.S +++ b/dev/ppc64le/src/ntt_ppc_asm.S @@ -1,10 +1,8 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 
2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 2025, 2026 - * - * =================================================================================== * Written by Danny Tsen */ @@ -270,8 +268,8 @@ * b_lo = vmladduhm(b, z, 0) = (b*z) mod 2^16 * vdata_b_i = vmladduhm(t, -q, b_lo) = b*z - t*q (mod 2^16) * - * Yields the signed canonical representative of (b*z) mod q, - * bounded by q/2. + * Computes (b*z) mod q in signed representation; the output + * is in the range (-q, q). */ .macro barrett_fqmul_4x _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3 vmhraddshs vresult_a1, vdata_b1, \_vzt0, V_ZERO diff --git a/dev/ppc64le/src/poly_tomont_ppc_asm.S b/dev/ppc64le/src/poly_tomont_ppc_asm.S index fdf1306924..21a1829eb6 100644 --- a/dev/ppc64le/src/poly_tomont_ppc_asm.S +++ b/dev/ppc64le/src/poly_tomont_ppc_asm.S @@ -1,12 +1,9 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 2025, 2026 - * - *=================================================================================== * Written by Danny Tsen - * */ /* diff --git a/dev/ppc64le/src/reduce_ppc_asm.S b/dev/ppc64le/src/reduce_ppc_asm.S index 4650310f3e..e689fdeeec 100644 --- a/dev/ppc64le/src/reduce_ppc_asm.S +++ b/dev/ppc64le/src/reduce_ppc_asm.S @@ -1,12 +1,9 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 
2025, 2026 - * - *=================================================================================== * Written by Danny Tsen - * */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ diff --git a/flake.nix b/flake.nix index f022fbc15f..47316eec91 100644 --- a/flake.nix +++ b/flake.nix @@ -145,7 +145,7 @@ # autogen shell with cross compiler for the "other" architecture devShells.cross-autogen = util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; } + packages = builtins.attrValues { inherit (config.packages) linters toolchain_ppc64le; inherit (pkgs) gcc-arm-embedded; } ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ] ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ]; }; diff --git a/integration/liboqs/ML-KEM-1024_META.yml b/integration/liboqs/ML-KEM-1024_META.yml index 9c7fe672ab..7d8e50d4c6 100644 --- a/integration/liboqs/ML-KEM-1024_META.yml +++ b/integration/liboqs/ML-KEM-1024_META.yml @@ -89,22 +89,3 @@ implementations: - Darwin required_flags: - asimd -- name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=1024 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux diff --git a/integration/liboqs/ML-KEM-512_META.yml b/integration/liboqs/ML-KEM-512_META.yml index f46dbfdbf1..aa88537d3f 100644 --- a/integration/liboqs/ML-KEM-512_META.yml +++ b/integration/liboqs/ML-KEM-512_META.yml @@ -89,22 +89,3 @@ implementations: - Darwin required_flags: - asimd -- name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=512 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux diff --git a/integration/liboqs/ML-KEM-768_META.yml b/integration/liboqs/ML-KEM-768_META.yml index 28dceb229d..254d67478a 100644 --- a/integration/liboqs/ML-KEM-768_META.yml +++ b/integration/liboqs/ML-KEM-768_META.yml @@ -89,22 +89,3 @@ implementations: - Darwin required_flags: - asimd -- name: ppc64le - version: FIPS203 - folder_name: . 
- compile_opts: -DMLK_CONFIG_PARAMETER_SET=768 -DMLK_CONFIG_FILE="../../integration/liboqs/config_ppc64le.h" - signature_keypair: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair - signature_keypair_derand: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_keypair_derand - signature_enc: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_enc - signature_dec: PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE_dec - sources: integration/liboqs/config_ppc64le.h integration/liboqs/fips202_glue.h integration/liboqs/fips202x4_glue.h - mlkem/src/cbmc.h mlkem/src/common.h mlkem/src/compress.c mlkem/src/compress.h - mlkem/src/debug.c mlkem/src/debug.h mlkem/src/indcpa.c mlkem/src/indcpa.h mlkem/src/kem.c - mlkem/src/kem.h mlkem/src/native/api.h mlkem/src/native/meta.h mlkem/src/params.h - mlkem/src/poly.c mlkem/src/poly.h mlkem/src/poly_k.c mlkem/src/poly_k.h mlkem/src/randombytes.h - mlkem/src/sampling.c mlkem/src/sampling.h mlkem/src/symmetric.h mlkem/src/sys.h - mlkem/src/verify.c mlkem/src/verify.h mlkem/src/zetas.inc mlkem/src/native/ppc64le - supported_platforms: - - architecture: ppc64le - operating_systems: - - Linux diff --git a/integration/liboqs/config_ppc64le.h b/integration/liboqs/config_ppc64le.h deleted file mode 100644 index 4e8da63047..0000000000 --- a/integration/liboqs/config_ppc64le.h +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (c) The mlkem-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [FIPS140_3_IG] - * Implementation Guidance for FIPS 140-3 and the Cryptographic Module - * Validation Program - * National Institute of Standards and Technology - * https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements - */ - -#ifndef MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H -#define MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H - -/****************************************************************************** - * Name: MLK_CONFIG_PARAMETER_SET - * - * Description: Specifies the parameter set for ML-KEM 
- * - MLK_CONFIG_PARAMETER_SET=512 corresponds to ML-KEM-512 - * - MLK_CONFIG_PARAMETER_SET=768 corresponds to ML-KEM-768 - * - MLK_CONFIG_PARAMETER_SET=1024 corresponds to ML-KEM-1024 - * - * This can also be set using CFLAGS. - * - *****************************************************************************/ -#ifndef MLK_CONFIG_PARAMETER_SET -#define MLK_CONFIG_PARAMETER_SET \ - 768 /* Change this for different security strengths */ -#endif - -/****************************************************************************** - * Name: MLK_CONFIG_NAMESPACE_PREFIX - * - * Description: The prefix to use to namespace global symbols from mlkem/. - * - * In a multi-level build (that is, if either - * - MLK_CONFIG_MULTILEVEL_WITH_SHARED, or - * - MLK_CONFIG_MULTILEVEL_NO_SHARED, - * are set, level-dependent symbols will additionally be prefixed - * with the parameter set (512/768/1024). - * - * This can also be set using CFLAGS. - * - *****************************************************************************/ -#if MLK_CONFIG_PARAMETER_SET == 512 -#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM512_PPC64LE -#elif MLK_CONFIG_PARAMETER_SET == 768 -#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM768_PPC64LE -#elif MLK_CONFIG_PARAMETER_SET == 1024 -#define MLK_CONFIG_NAMESPACE_PREFIX PQCP_MLKEM_NATIVE_MLKEM1024_PPC64LE -#endif - -/****************************************************************************** - * Name: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH - * - * Description: Determines whether an native arithmetic backend should be used. - * - * The arithmetic backend covers performance critical functions - * such as the number-theoretic transform (NTT). - * - * If this option is unset, the C backend will be used. - * - * If this option is set, the arithmetic backend to be use is - * determined by MLK_CONFIG_ARITH_BACKEND_FILE: If the latter is - * unset, the default backend for your the target architecture - * will be used. 
If set, it must be the name of a backend metadata - * file. - * - * This can also be set using CFLAGS. - * - *****************************************************************************/ -#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH - -/****************************************************************************** - * Name: MLK_CONFIG_ARITH_BACKEND_FILE - * - * Description: The arithmetic backend to use. - * - * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is unset, this option - * is ignored. - * - * If MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, this option must - * either be undefined or the filename of an arithmetic backend. - * If unset, the default backend will be used. - * - * This can be set using CFLAGS. - * - *****************************************************************************/ -#define MLK_CONFIG_ARITH_BACKEND_FILE "native/meta.h" - -/****************************************************************************** - * Name: MLK_CONFIG_FIPS202_CUSTOM_HEADER - * - * Description: Custom header to use for FIPS-202 - * - * This should only be set if you intend to use a custom - * FIPS-202 implementation, different from the one shipped - * with mlkem-native. - * - * If set, it must be the name of a file serving as the - * replacement for mlkem/fips202/fips202.h, and exposing - * the same API (see FIPS202.md). - * - *****************************************************************************/ -/* -#define MLK_CONFIG_FIPS202_CUSTOM_HEADER \ - "../../integration/liboqs/fips202_glue.h" -*/ - -/****************************************************************************** - * Name: MLK_CONFIG_FIPS202X4_CUSTOM_HEADER - * - * Description: Custom header to use for FIPS-202-X4 - * - * This should only be set if you intend to use a custom - * FIPS-202 implementation, different from the one shipped - * with mlkem-native. 
- * - * If set, it must be the name of a file serving as the - * replacement for mlkem/fips202/fips202x4.h, and exposing - * the same API (see FIPS202.md). - * - *****************************************************************************/ -/* -#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER \ - "../../integration/liboqs/fips202x4_glue.h" -*/ - -/****************************************************************************** - * Name: MLK_CONFIG_CUSTOM_ZEROIZE - * - * Description: In compliance with FIPS 203 Section 3.3, mlkem-native zeroizes - * intermediate stack buffers before returning from function calls. - * - * Set this option and define `mlk_zeroize` if you want to - * use a custom method to zeroize intermediate stack buffers. - * The default implementation uses SecureZeroMemory on Windows - * and a memset + compiler barrier otherwise. If neither of those - * is available on the target platform, compilation will fail, - * and you will need to use MLK_CONFIG_CUSTOM_ZEROIZE to provide - * a custom implementation of `mlk_zeroize()`. - * - * WARNING: - * The explicit stack zeroization conducted by mlkem-native - * reduces the likelihood of data leaking on the stack, but - * does not eliminate it! The C standard makes no guarantee about - * where a compiler allocates structures and whether/where it makes - * copies of them. Also, in addition to entire structures, there - * may also be potentially exploitable leakage of individual values - * on the stack. - * - * If you need bullet-proof zeroization of the stack, you need to - * consider additional measures instead of of what this feature - * provides. In this case, you can set mlk_zeroize to a no-op. - * - *****************************************************************************/ -/* #define MLK_CONFIG_CUSTOM_ZEROIZE - #if !defined(__ASSEMBLER__) - #include - #include "sys.h" - static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) - { - ... your implementation ... 
- } - #endif -*/ - -/****************************************************************************** - * Name: MLK_CONFIG_CUSTOM_RANDOMBYTES - * - * Description: mlkem-native does not provide a secure randombytes - * implementation. Such an implementation has to provided by the - * consumer. - * - * If this option is not set, mlkem-native expects a function - * void randombytes(uint8_t *out, size_t outlen). - * - * Set this option and define `mlk_randombytes` if you want to - * use a custom method to sample randombytes with a different name - * or signature. - * - *****************************************************************************/ -#define MLK_CONFIG_CUSTOM_RANDOMBYTES -#if !defined(__ASSEMBLER__) -#include -#include -#include "../../mlkem/src/sys.h" -static MLK_INLINE void mlk_randombytes(uint8_t *ptr, size_t len) -{ - OQS_randombytes(ptr, len); -} -#endif /* !__ASSEMBLER__ */ - -/****************************************************************************** - * Name: MLK_CONFIG_NO_ASM - * - * Description: If this option is set, mlkem-native will be built without - * use of native code or inline assembly. - * - * By default, inline assembly is used to implement value barriers. - * Without inline assembly, mlkem-native will use a global volatile - * 'opt blocker' instead; see verify.h. - * - * Inline assembly is also used to implement a secure zeroization - * function on non-Windows platforms. If this option is set and - * the target platform is not Windows, you MUST set - * MLK_CONFIG_CUSTOM_ZEROIZE and provide a custom zeroization - * function. - * - * If this option is set, MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 and - * and MLK_CONFIG_USE_NATIVE_BACKEND_ARITH will be ignored, and no - *native backends will be used. 
- * - *****************************************************************************/ -/* #define MLK_CONFIG_NO_ASM */ - -/****************************************************************************** - * Name: MLK_CONFIG_KEYGEN_PCT - * - * Description: Compliance with @[FIPS140_3_IG, p.87] requires a - * Pairwise Consistency Test (PCT) to be carried out on a freshly - * generated keypair before it can be exported. - * - * Set this option if such a check should be implemented. - * In this case, crypto_kem_keypair_derand and crypto_kem_keypair - * will return a non-zero error code if the PCT failed. - * - * NOTE: This feature will drastically lower the performance of - * key generation. - * - *****************************************************************************/ -/* #define MLK_CONFIG_KEYGEN_PCT */ - -/****************************************************************************** - * Name: MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST - * - * Description: If this option is set, the user must provide a runtime - * function `static inline int mlk_break_pct() { ... }` to - * indicate whether the PCT should be made fail. - * - * This option only has an effect if MLK_CONFIG_KEYGEN_PCT is set. - * - *****************************************************************************/ -/* #define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST - #if !defined(__ASSEMBLER__) - #include "sys.h" - static MLK_INLINE int mlk_break_pct(void) - { - ... return 0/1 depending on whether PCT should be broken ... - } - #endif -*/ - -/* Enable valgrind-based assertions in mlkem-native through macro - * from libOQS. 
*/ -#if !defined(__ASSEMBLER__) -#include -#if defined(OQS_ENABLE_TEST_CONSTANT_TIME) -#define MLK_CONFIG_CT_TESTING_ENABLED -#endif -#endif /* !__ASSEMBLER__ */ - -#endif /* !MLK_INTEGRATION_LIBOQS_CONFIG_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 4e4a3dde72..6f3f71eff4 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -6,7 +6,7 @@ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ - !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) #include "consts.h" @@ -90,4 +90,4 @@ MLK_ALIGN const int16_t mlk_ppc_qdata[] = { #include "consts_intt_tw.inc" }; #endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ - !MLK_CONFIG_MULTILEVEL_NO_SHARED */ + !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ */ diff --git a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S index 958cfdb354..f4572b2855 100644 --- a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S @@ -1,10 +1,8 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 
2025, 2026 - * - * =================================================================================== * Written by Danny Tsen */ @@ -39,31 +37,31 @@ MLK_ASM_FN_SYMBOL(intt_ppc_asm) li 14, 176 li 15, 192 li 16, 208 - stxvx 52, 10, 1 - stxvx 53, 11, 1 - stxvx 54, 12, 1 - stxvx 55, 14, 1 - stxvx 56, 15, 1 - stxvx 57, 16, 1 + stxvd2x 52, 10, 1 + stxvd2x 53, 11, 1 + stxvd2x 54, 12, 1 + stxvd2x 55, 14, 1 + stxvd2x 56, 15, 1 + stxvd2x 57, 16, 1 li 10, 224 li 11, 240 li 12, 256 li 14, 272 li 15, 288 li 16, 304 - stxvx 58, 10, 1 - stxvx 59, 11, 1 - stxvx 60, 12, 1 - stxvx 61, 14, 1 - stxvx 62, 15, 1 - stxvx 63, 16, 1 - lxvx 0, 0, 4 + stxvd2x 58, 10, 1 + stxvd2x 59, 11, 1 + stxvd2x 60, 12, 1 + stxvd2x 61, 14, 1 + stxvd2x 62, 15, 1 + stxvd2x 63, 16, 1 + lxvd2x 0, 0, 4 xxlxor 35, 35, 35 xxlor 3, 35, 35 li 10, 32 li 11, 48 - lxvx 6, 10, 4 - lxvx 32, 11, 4 + lxvd2x 6, 10, 4 + lxvd2x 32, 11, 4 vspltisw 8, 13 vadduwm 8, 8, 8 xxlor 8, 40, 40 @@ -135,7 +133,7 @@ intt_ppc_asm_Loopf: addi 3, 3, -512 nop nop - nop + ori 2, 2, 0 addi 14, 4, 1120 addi 22, 4, 3136 li 7, 4 @@ -1135,7 +1133,7 @@ intt_ppc_asm_Loopf: stxvd2x 49, 18, 5 addi 5, 5, 128 nop - nop + ori 2, 2, 0 li 7, 16 li 9, 0 add 10, 7, 9 @@ -1567,7 +1565,7 @@ intt_ppc_asm_Loopf: stxvd2x 60, 3, 21 nop nop - nop + ori 2, 2, 0 li 7, 32 li 9, 0 add 10, 7, 9 @@ -2003,7 +2001,7 @@ intt_ppc_asm_Loopf: stxvd2x 60, 3, 21 nop nop - nop + ori 2, 2, 0 li 7, 64 li 9, 0 add 10, 7, 9 @@ -2399,7 +2397,7 @@ intt_ppc_asm_Loopf: addi 22, 22, 16 nop nop - nop + ori 2, 2, 0 li 7, 128 li 9, 0 add 10, 7, 9 @@ -2791,7 +2789,7 @@ intt_ppc_asm_Loopf: addi 22, 22, 16 nop nop - nop + ori 2, 2, 0 li 7, 256 li 9, 0 add 10, 7, 9 @@ -3183,24 +3181,24 @@ intt_ppc_asm_Loopf: li 14, 176 li 15, 192 li 16, 208 - lxvx 52, 10, 1 - lxvx 53, 11, 1 - lxvx 54, 12, 1 - lxvx 55, 14, 1 - lxvx 56, 15, 1 - lxvx 57, 16, 1 + lxvd2x 52, 10, 1 + lxvd2x 53, 11, 1 + lxvd2x 54, 12, 1 + lxvd2x 55, 14, 1 + lxvd2x 56, 15, 1 + lxvd2x 57, 16, 1 li 10, 224 li 11, 240 li 12, 256 li 14, 
272 li 15, 288 li 16, 304 - lxvx 58, 10, 1 - lxvx 59, 11, 1 - lxvx 60, 12, 1 - lxvx 61, 14, 1 - lxvx 62, 15, 1 - lxvx 63, 16, 1 + lxvd2x 58, 10, 1 + lxvd2x 59, 11, 1 + lxvd2x 60, 12, 1 + lxvd2x 61, 14, 1 + lxvd2x 62, 15, 1 + lxvd2x 63, 16, 1 ld 14, 56(1) ld 15, 64(1) ld 16, 72(1) diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S index bc3a86db8a..6a99943b86 100644 --- a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S @@ -1,10 +1,8 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 2025, 2026 - * - * =================================================================================== * Written by Danny Tsen */ @@ -39,29 +37,29 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 14, 176 li 15, 192 li 16, 208 - stxvx 52, 10, 1 - stxvx 53, 11, 1 - stxvx 54, 12, 1 - stxvx 55, 14, 1 - stxvx 56, 15, 1 - stxvx 57, 16, 1 + stxvd2x 52, 10, 1 + stxvd2x 53, 11, 1 + stxvd2x 54, 12, 1 + stxvd2x 55, 14, 1 + stxvd2x 56, 15, 1 + stxvd2x 57, 16, 1 li 10, 224 li 11, 240 li 12, 256 li 14, 272 li 15, 288 li 16, 304 - stxvx 58, 10, 1 - stxvx 59, 11, 1 - stxvx 60, 12, 1 - stxvx 61, 14, 1 - stxvx 62, 15, 1 - stxvx 63, 16, 1 + stxvd2x 58, 10, 1 + stxvd2x 59, 11, 1 + stxvd2x 60, 12, 1 + stxvd2x 61, 14, 1 + stxvd2x 62, 15, 1 + stxvd2x 63, 16, 1 lvx 5, 0, 4 addi 14, 4, 112 addi 22, 4, 2128 vxor 3, 3, 3 - nop + ori 2, 2, 0 li 7, 256 lvx 10, 0, 14 lvx 2, 0, 22 @@ -245,7 +243,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 63, 3, 21 nop nop - nop + ori 2, 2, 0 li 7, 128 lvx 10, 0, 14 lvx 2, 0, 22 @@ -433,7 +431,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 63, 3, 21 nop nop - nop + ori 2, 2, 0 li 7, 64 lvx 10, 0, 14 lvx 2, 0, 22 @@ -629,7 +627,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) stxvd2x 63, 3, 21 nop nop - nop + ori 2, 2, 0 li 7, 32 li 10, 16 li 11, 32 @@ -833,7 +831,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) 
stxvd2x 58, 3, 19 stxvd2x 62, 3, 20 stxvd2x 63, 3, 21 - nop + ori 2, 2, 0 li 7, 16 li 10, 16 li 11, 32 @@ -1073,7 +1071,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 17, 96 li 18, 112 nop - nop + ori 2, 2, 0 li 10, 16 li 11, 32 li 12, 48 @@ -1341,7 +1339,7 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) mr 5, 3 li 7, 4 nop - nop + ori 2, 2, 0 li 10, 16 li 11, 32 li 12, 48 @@ -1612,24 +1610,24 @@ MLK_ASM_FN_SYMBOL(ntt_ppc_asm) li 14, 176 li 15, 192 li 16, 208 - lxvx 52, 10, 1 - lxvx 53, 11, 1 - lxvx 54, 12, 1 - lxvx 55, 14, 1 - lxvx 56, 15, 1 - lxvx 57, 16, 1 + lxvd2x 52, 10, 1 + lxvd2x 53, 11, 1 + lxvd2x 54, 12, 1 + lxvd2x 55, 14, 1 + lxvd2x 56, 15, 1 + lxvd2x 57, 16, 1 li 10, 224 li 11, 240 li 12, 256 li 14, 272 li 15, 288 li 16, 304 - lxvx 58, 10, 1 - lxvx 59, 11, 1 - lxvx 60, 12, 1 - lxvx 61, 14, 1 - lxvx 62, 15, 1 - lxvx 63, 16, 1 + lxvd2x 58, 10, 1 + lxvd2x 59, 11, 1 + lxvd2x 60, 12, 1 + lxvd2x 61, 14, 1 + lxvd2x 62, 15, 1 + lxvd2x 63, 16, 1 ld 14, 56(1) ld 15, 64(1) ld 16, 72(1) diff --git a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S index e4157fd1d4..170012c3bf 100644 --- a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S @@ -1,12 +1,9 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 
2025, 2026 - * - *=================================================================================== * Written by Danny Tsen - * */ /* @@ -39,27 +36,27 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) li 10, 192 li 11, 208 li 12, 224 - stxvx 52, 6, 1 - stxvx 53, 7, 1 - stxvx 54, 8, 1 - stxvx 55, 9, 1 - stxvx 56, 10, 1 - stxvx 57, 11, 1 - stxvx 58, 12, 1 + stxvd2x 52, 6, 1 + stxvd2x 53, 7, 1 + stxvd2x 54, 8, 1 + stxvd2x 55, 9, 1 + stxvd2x 56, 10, 1 + stxvd2x 57, 11, 1 + stxvd2x 58, 12, 1 li 6, 240 li 7, 256 li 8, 272 li 9, 288 - stxvx 59, 6, 1 - stxvx 60, 7, 1 - stxvx 61, 8, 1 - stxvx 62, 9, 1 + stxvd2x 59, 6, 1 + stxvd2x 60, 7, 1 + stxvd2x 61, 8, 1 + stxvd2x 62, 9, 1 li 6, 0 li 7, 16 li 8, 96 - lxvx 37, 6, 4 - lxvx 34, 7, 4 - lxvx 32, 8, 4 + lxvd2x 37, 6, 4 + lxvd2x 34, 7, 4 + lxvd2x 32, 8, 4 vxor 3, 3, 3 vspltish 4, 1 li 4, -128 @@ -333,21 +330,21 @@ MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) li 10, 192 li 11, 208 li 12, 224 - lxvx 52, 6, 1 - lxvx 53, 7, 1 - lxvx 54, 8, 1 - lxvx 55, 9, 1 - lxvx 56, 10, 1 - lxvx 57, 11, 1 - lxvx 58, 12, 1 + lxvd2x 52, 6, 1 + lxvd2x 53, 7, 1 + lxvd2x 54, 8, 1 + lxvd2x 55, 9, 1 + lxvd2x 56, 10, 1 + lxvd2x 57, 11, 1 + lxvd2x 58, 12, 1 li 6, 240 li 7, 256 li 8, 272 li 9, 288 - lxvx 59, 6, 1 - lxvx 60, 7, 1 - lxvx 61, 8, 1 - lxvx 62, 9, 1 + lxvd2x 59, 6, 1 + lxvd2x 60, 7, 1 + lxvd2x 61, 8, 1 + lxvd2x 62, 9, 1 mtlr 0 addi 1, 1, 320 blr diff --git a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S index f084651bff..25814d05bb 100644 --- a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S +++ b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S @@ -1,12 +1,9 @@ /* * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT * - * Copyright IBM Corp. 
2025, 2026 - * - *=================================================================================== * Written by Danny Tsen - * */ #include "../../../common.h" #if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ @@ -32,16 +29,16 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) li 8, 160 li 9, 176 li 10, 192 - stxvx 52, 6, 1 - stxvx 53, 7, 1 - stxvx 54, 8, 1 - stxvx 55, 9, 1 - stxvx 56, 10, 1 + stxvd2x 52, 6, 1 + stxvd2x 53, 7, 1 + stxvd2x 54, 8, 1 + stxvd2x 55, 9, 1 + stxvd2x 56, 10, 1 vxor 7, 7, 7 li 6, 32 li 7, 48 - lxvx 35, 6, 4 - lxvx 32, 7, 4 + lxvd2x 35, 6, 4 + lxvd2x 32, 7, 4 vspltisw 2, 13 vadduwm 2, 2, 2 vspltisw 4, 1 @@ -694,11 +691,11 @@ MLK_ASM_FN_SYMBOL(reduce_ppc_asm) li 8, 160 li 9, 176 li 10, 192 - lxvx 52, 6, 1 - lxvx 53, 7, 1 - lxvx 54, 8, 1 - lxvx 55, 9, 1 - lxvx 56, 10, 1 + lxvd2x 52, 6, 1 + lxvd2x 53, 7, 1 + lxvd2x 54, 8, 1 + lxvd2x 55, 9, 1 + lxvd2x 56, 10, 1 mtlr 0 addi 1, 1, 224 blr diff --git a/test/mk/components.mk b/test/mk/components.mk index e083005f8e..01dd79e98e 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,8 +8,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) - SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS]) - SOURCES += $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS]) $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif From fca498bf2757a372859c700760f12a0776a8408e Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Wed, 20 May 2026 14:59:57 +0200 Subject: [PATCH 24/27] Updates after merge Signed-off-by: Basil Hess --- dev/ppc64le/src/consts.c | 2 +- dev/ppc64le/src/consts.h | 2 +- mlkem/src/native/ppc64le/src/consts.c | 2 +- 
mlkem/src/native/ppc64le/src/consts.h | 2 +- scripts/autogen | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index 6f3f71eff4..a97a924396 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -10,7 +10,7 @@ #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[] = { +MLK_ALIGN MLK_INTERNAL_DATA_DEFINITION const int16_t mlk_ppc_qdata[2072] = { /* -Q */ /* check-magic: -3329 == -1 * MLKEM_Q */ -3329, diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index 1622e8ccbb..2f57b2af89 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -24,7 +24,7 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) -extern const int16_t mlk_ppc_qdata[]; +MLK_INTERNAL_DATA_DECLARATION const int16_t mlk_ppc_qdata[2072]; #endif #endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index 6f3f71eff4..a97a924396 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -10,7 +10,7 @@ #include "consts.h" -MLK_ALIGN const int16_t mlk_ppc_qdata[] = { +MLK_ALIGN MLK_INTERNAL_DATA_DEFINITION const int16_t mlk_ppc_qdata[2072] = { /* -Q */ /* check-magic: -3329 == -1 * MLKEM_Q */ -3329, diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index e72b954cf9..7391d5586f 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -24,7 +24,7 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) -extern const int16_t mlk_ppc_qdata[]; +MLK_INTERNAL_DATA_DECLARATION const int16_t mlk_ppc_qdata[2072]; #endif #endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/scripts/autogen b/scripts/autogen index 6bc51d06bc..8f0fd575a6 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -3117,8 +3117,8 @@ def update_via_simpasm( "-o", tmp.name, ] - # TODO: 
Support CFI for Armv8.1-M - if arch != "armv81m": + # TODO: Support CFI for ppc64le + if arch != "ppc64le": cmd += ["--cfify"] if cross_prefix is not None: # Stick with llvm-objdump for disassembly From 6b242cedf2b540b950ca07f37cddf4094814648a Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Thu, 21 May 2026 09:31:02 +0200 Subject: [PATCH 25/27] ppc64le: fix Python and magic-constant linting - ruff format scripts/autogen (formatting fix) - Add check-magic annotation for array size 2072 in consts.c and consts.h (7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values) Signed-off-by: Basil Hess --- dev/ppc64le/src/consts.c | 2 ++ dev/ppc64le/src/consts.h | 4 +++- mlkem/src/native/ppc64le/src/consts.c | 2 ++ mlkem/src/native/ppc64le/src/consts.h | 4 +++- scripts/autogen | 10 +++++++++- 5 files changed, 19 insertions(+), 3 deletions(-) diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c index a97a924396..4065b60231 100644 --- a/dev/ppc64le/src/consts.c +++ b/dev/ppc64le/src/consts.c @@ -10,6 +10,8 @@ #include "consts.h" +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ MLK_ALIGN MLK_INTERNAL_DATA_DEFINITION const int16_t mlk_ppc_qdata[2072] = { /* -Q */ /* check-magic: -3329 == -1 * MLKEM_Q */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h index 2f57b2af89..851cd3392e 100644 --- a/dev/ppc64le/src/consts.h +++ b/dev/ppc64le/src/consts.h @@ -24,7 +24,9 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ MLK_INTERNAL_DATA_DECLARATION const int16_t mlk_ppc_qdata[2072]; -#endif +#endif /* !__ASSEMBLER__ */ #endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c index a97a924396..4065b60231 100644 --- a/mlkem/src/native/ppc64le/src/consts.c +++ 
b/mlkem/src/native/ppc64le/src/consts.c @@ -10,6 +10,8 @@ #include "consts.h" +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ MLK_ALIGN MLK_INTERNAL_DATA_DEFINITION const int16_t mlk_ppc_qdata[2072] = { /* -Q */ /* check-magic: -3329 == -1 * MLKEM_Q */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h index 7391d5586f..e35f89bce2 100644 --- a/mlkem/src/native/ppc64le/src/consts.h +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -24,7 +24,9 @@ #ifndef __ASSEMBLER__ #define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ MLK_INTERNAL_DATA_DECLARATION const int16_t mlk_ppc_qdata[2072]; -#endif +#endif /* !__ASSEMBLER__ */ #endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/scripts/autogen b/scripts/autogen index 8f0fd575a6..d87b470eca 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -3587,7 +3587,15 @@ def synchronize_backends( delete=delete, force_cross=force_cross, no_simplify=no_simplify, - cflags=" ".join(filter(None, [extra_cflags, "-Idev/ppc64le/src -Imlkem/src/native/ppc64le/src -mcpu=power8"])), + cflags=" ".join( + filter( + None, + [ + extra_cflags, + "-Idev/ppc64le/src -Imlkem/src/native/ppc64le/src -mcpu=power8", + ], + ) + ), ) From 344278639c1620b3152838ddd22224b507754994 Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Thu, 21 May 2026 13:27:17 +0200 Subject: [PATCH 26/27] ppc64le: fix power7 fallback by compiling for power7 but running for power8 (nix libc is compiled for power8 and otherwise causes illegal instructions). Avoids unused data parameter errors in the fallback code path.
Signed-off-by: Basil Hess --- .github/actions/multi-functest/action.yml | 4 ++-- dev/ppc64le/meta.h | 4 ++++ mlkem/src/native/ppc64le/meta.h | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index dc72b25f25..468f2925ab 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -172,7 +172,7 @@ runs: alloc: ${{ inputs.alloc }} rng_fail: ${{ inputs.rng_fail }} extra_args: ${{ inputs.extra_args }} - - name: Cross ppc64le Tests (POWER7) + - name: Cross ppc64le Tests (POWER7 fallback path) if: ${{ (inputs.compile_mode == 'all' || inputs.compile_mode == 'cross-ppc64le') && (success() || failure()) }} uses: ./.github/actions/functest with: @@ -184,7 +184,7 @@ runs: cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mcpu=power7" ldflags: ${{ inputs.ldflags }} cross_prefix: powerpc64le-unknown-linux-gnu- - exec_wrapper: "qemu-ppc64le -cpu power7" + exec_wrapper: "qemu-ppc64le -cpu power8" opt: ${{ inputs.opt }} func: ${{ inputs.func }} kat: ${{ inputs.kat }} diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h index 616c14f53c..baa95db2d4 100644 --- a/dev/ppc64le/meta.h +++ b/dev/ppc64le/meta.h @@ -32,6 +32,7 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) mlk_ntt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } @@ -43,6 +44,7 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) mlk_intt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } @@ -54,6 +56,7 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) mlk_reduce_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } @@ -65,6 +68,7 @@ static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) 
mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h index 4a3018c2ac..9cd3b66cdd 100644 --- a/mlkem/src/native/ppc64le/meta.h +++ b/mlkem/src/native/ppc64le/meta.h @@ -32,6 +32,7 @@ static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) mlk_ntt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } @@ -43,6 +44,7 @@ static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) mlk_intt_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } @@ -54,6 +56,7 @@ static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) mlk_reduce_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } @@ -65,6 +68,7 @@ static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); return MLK_NATIVE_FUNC_SUCCESS; #else + (void)data; return MLK_NATIVE_FUNC_FALLBACK; #endif } From 12036a9426a543a452939cf6917a64d9f66a9b43 Mon Sep 17 00:00:00 2001 From: Basil Hess Date: Thu, 21 May 2026 13:51:44 +0200 Subject: [PATCH 27/27] ppc64le: workaround to avoid hol-light failures Signed-off-by: Basil Hess --- flake.nix | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.nix b/flake.nix index 7640cf0d2e..660577dc3d 100644 --- a/flake.nix +++ b/flake.nix @@ -114,13 +114,13 @@ packages = builtins.attrValues { inherit (config.packages) linters hol_light s2n_bignum hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchains hol_light s2n_bignum gcc-arm-embedded hol_server; }; + packages = builtins.attrValues { 
inherit (config.packages) linters toolchains toolchain_ppc64le hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross-aarch64 = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64 hol_light s2n_bignum gcc-arm-embedded hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64 toolchain_ppc64le hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross-x86_64 = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64 hol_light s2n_bignum gcc-arm-embedded hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64 toolchain_ppc64le hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.ci = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchains_native; };