Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions BIBLIOGRAPHY.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ source code and documentation.
* Referenced from:
- [README.md](README.md)

### `ADOMNICAI23`

* An update on Keccak performance on ARMv7-M
* Author(s):
- Alexandre Adomnicai
* URL: https://eprint.iacr.org/2023/773
* Referenced from:
- [dev/fips202/armv81m/src/keccak_f1600_x1_armv7m.S](dev/fips202/armv81m/src/keccak_f1600_x1_armv7m.S)
- [mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv7m.S](mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv7m.S)

### `AVX2_NTT`

* Faster AVX2 optimized NTT multiplication for Ring-LWE lattice cryptography.
Expand Down Expand Up @@ -284,6 +294,18 @@ source code and documentation.
- [dev/README.md](dev/README.md)
- [proofs/hol_light/README.md](proofs/hol_light/README.md)

### `SLOTHYM7`

* Enabling Microarchitectural Agility: Taking ML-KEM & ML-DSA from Cortex-M4 to M7 with SLOTHY
* Author(s):
- Amin Abdulrahman
- Matthias J. Kannwischer
- Thing-Han Lim
* URL: https://eprint.iacr.org/2025/366
* Referenced from:
- [dev/fips202/armv81m/src/keccak_f1600_x1_armv7m.S](dev/fips202/armv81m/src/keccak_f1600_x1_armv7m.S)
- [mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv7m.S](mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv7m.S)

### `SLOTHY_Paper`

* Fast and Clean: Auditable high-performance assembly via constraint solving
Expand All @@ -308,6 +330,20 @@ source code and documentation.
- [proofs/hol_light/aarch64/mlkem/mlkem_intt.S](proofs/hol_light/aarch64/mlkem/mlkem_intt.S)
- [proofs/hol_light/aarch64/mlkem/mlkem_ntt.S](proofs/hol_light/aarch64/mlkem/mlkem_ntt.S)

### `XKCP`

* eXtended Keccak Code Package
* Author(s):
- Guido Bertoni
- Joan Daemen
- Michaël Peeters
- Gilles Van Assche
- Ronny Van Keer
* URL: https://github.com/XKCP/XKCP
* Referenced from:
- [dev/fips202/armv81m/src/keccak_f1600_x1_armv7m.S](dev/fips202/armv81m/src/keccak_f1600_x1_armv7m.S)
- [mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv7m.S](mlkem/src/fips202/native/armv81m/src/keccak_f1600_x1_armv7m.S)

### `clangover`

* clangover
Expand Down
23 changes: 23 additions & 0 deletions BIBLIOGRAPHY.yml
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,26 @@
name: tiny_sha3
author: Saarinen, Markku-Juhani O.
url: https://github.com/mjosaarinen/tiny_sha3

- id: XKCP
name: eXtended Keccak Code Package
author:
- Bertoni, Guido
- Daemen, Joan
- Peeters, Michaël
- Van Assche, Gilles
- Van Keer, Ronny
url: https://github.com/XKCP/XKCP

- id: ADOMNICAI23
name: "An update on Keccak performance on ARMv7-M"
author: Adomnicai, Alexandre
url: https://eprint.iacr.org/2023/773

- id: SLOTHYM7
name: "Enabling Microarchitectural Agility: Taking ML-KEM & ML-DSA from Cortex-M4 to M7 with SLOTHY"
author:
- Abdulrahman, Amin
- Kannwischer, Matthias J.
- Lim, Thing-Han
url: https://eprint.iacr.org/2025/366
56 changes: 55 additions & 1 deletion dev/fips202/armv81m/mve.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,30 @@
#define MLK_FIPS202_NATIVE_ARMV81M

/* Part of backend API */
#define MLK_USE_FIPS202_X1_NATIVE
#define MLK_USE_FIPS202_X4_NATIVE
#define MLK_USE_FIPS202_X1_XOR_BYTES_NATIVE
#define MLK_USE_FIPS202_X1_EXTRACT_BYTES_NATIVE
#define MLK_USE_FIPS202_X4_XOR_BYTES_NATIVE
#define MLK_USE_FIPS202_X4_EXTRACT_BYTES_NATIVE
/* Guard for assembly file */

/* Guard for assembly files */
#define MLK_FIPS202_ARMV81M_NEED_X1
#define MLK_FIPS202_ARMV81M_NEED_X4

#if !defined(__ASSEMBLER__)
#include "../api.h"

/*
 * Native x1 permutation
 *
 * mlk_keccak_f1600_x1_native is a thin wrapper: it forwards `state` to
 * mlk_keccak_f1600_x1_native_impl and returns that function's result
 * unchanged. The return value must be checked by the caller
 * (MLK_MUST_CHECK_RETURN_VALUE).
 *
 * NOTE(review): the layout expected in `state` is not visible here;
 * presumably it matches the bit-interleaved format used by the x4 path --
 * confirm against the implementation.
 */
#define mlk_keccak_f1600_x1_native_impl \
MLK_NAMESPACE(keccak_f1600_x1_native_impl)
int mlk_keccak_f1600_x1_native_impl(uint64_t *state);

MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_keccak_f1600_x1_native(uint64_t *state)
{
return mlk_keccak_f1600_x1_native_impl(state);
}

/*
* Native x4 permutation
* State is kept in bit-interleaved format.
Expand All @@ -32,6 +47,45 @@ static MLK_INLINE int mlk_keccak_f1600_x4_native(uint64_t *state)
return mlk_keccak_f1600_x4_native_impl(state);
}

/*
 * Native x1 XOR bytes (with on-the-fly bit interleaving)
 *
 * The wrapper forwards all four arguments to
 * mlk_keccak_f1600_x1_state_xor_bytes_impl, which returns void, and then
 * unconditionally reports MLK_NATIVE_FUNC_SUCCESS.
 *
 * NOTE(review): the MLK_NAMESPACE argument below carries an extra `mlk_`
 * prefix, unlike the keccak_f1600_x1_native_impl mapping earlier in this
 * header -- confirm whether the doubled prefix is intended.
 * NOTE(review): `offset` and `length` are presumably byte counts into the
 * state and `data` respectively -- confirm against the implementation.
 */
#define mlk_keccak_f1600_x1_state_xor_bytes_impl \
MLK_NAMESPACE(mlk_keccak_f1600_x1_state_xor_bytes_impl)
void mlk_keccak_f1600_x1_state_xor_bytes_impl(uint64_t *state,
const uint8_t *data,
unsigned offset, unsigned length);

MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_keccakf1600_xor_bytes_x1_native(uint64_t *state,
const uint8_t *data,
unsigned offset,
unsigned length)
{
mlk_keccak_f1600_x1_state_xor_bytes_impl(state, data, offset, length);
/* The implementation has no failure path to report, so always succeed. */
return MLK_NATIVE_FUNC_SUCCESS;
}

/*
 * Native x1 extract bytes (with on-the-fly bit de-interleaving)
 *
 * The wrapper forwards all four arguments to
 * mlk_keccak_f1600_x1_state_extract_bytes_impl, which returns void, and then
 * unconditionally reports MLK_NATIVE_FUNC_SUCCESS. `data` is the non-const
 * destination buffer.
 *
 * NOTE(review): the MLK_NAMESPACE argument below carries an extra `mlk_`
 * prefix, unlike the keccak_f1600_x1_native_impl mapping earlier in this
 * header -- confirm whether the doubled prefix is intended.
 * NOTE(review): `state` is passed as a non-const pointer although extraction
 * presumably only reads it -- confirm whether it can be const-qualified.
 */
#define mlk_keccak_f1600_x1_state_extract_bytes_impl \
MLK_NAMESPACE(mlk_keccak_f1600_x1_state_extract_bytes_impl)
void mlk_keccak_f1600_x1_state_extract_bytes_impl(uint64_t *state,
uint8_t *data,
unsigned offset,
unsigned length);

MLK_MUST_CHECK_RETURN_VALUE
static MLK_INLINE int mlk_keccakf1600_extract_bytes_x1_native(uint64_t *state,
uint8_t *data,
unsigned offset,
unsigned length)
{
mlk_keccak_f1600_x1_state_extract_bytes_impl(state, data, offset, length);
/* The implementation has no failure path to report, so always succeed. */
return MLK_NATIVE_FUNC_SUCCESS;
}

/*
* Native x4 XOR bytes (with on-the-fly bit interleaving)
*/
Expand Down
117 changes: 117 additions & 0 deletions dev/fips202/armv81m/src/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Copyright (c) The mlkem-native project authors
# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT

.PHONY: all purge FORCE
.DEFAULT_GOAL := all

# ISA to optimize for
TARGET_ISA=Arm_v81M

# MicroArch target to optimize for
TARGET_MICROARCH=Arm_Cortex_M55
# SLOTHY optimization for the x4 MVE Keccak-f1600 permutation.
# Two slothy-cli passes are run over the region delimited by the
# keccak_f1600_x4_mve_asm_roundstart / keccak_f1600_x4_mve_asm_roundend_pre
# labels. Pass 1 writes to a temporary file, pass 2 reads that file and writes
# the final output, and the temporary file is removed afterwards.
keccak_f1600_x4_mve.S: ../../armv81m_symbolic/keccak_f1600_x4_mve_clean.S
	$(eval TMP := $(shell mktemp))
# Pass 1: functional constraints only, small displacement
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMP) \
		-s keccak_f1600_x4_mve_asm_roundstart \
		-e keccak_f1600_x4_mve_asm_roundend_pre \
		-c unsafe_address_offset_fixup=False \
		-c inputs_are_outputs=True \
		-c constraints.functional_only=True \
		-c constraints.allow_reordering=True \
		-c constraints.max_displacement=0.1
# Pass 2: full optimization with splitting heuristics
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMP) -o $@ \
		-s keccak_f1600_x4_mve_asm_roundstart \
		-e keccak_f1600_x4_mve_asm_roundend_pre \
		-c constraints.functional_only=False \
		-c constraints.allow_reordering=True \
		-c variable_size=True \
		-c inputs_are_outputs=True \
		-c constraints.stalls_first_attempt=64 \
		-c constraints.max_displacement=1.0 \
		-c constraints.stalls_maximum_attempt=4096 \
		-c unsafe_address_offset_fixup=False \
		-c split_heuristic=True \
		-c split_heuristic_stepsize=0.05 \
		-c split_heuristic_factor=26 \
		-c split_heuristic_repeat=2 \
		-c split_heuristic_estimate_performance=False \
		-c split_heuristic_optimize_seam=2
	rm -f $(TMP)

# -----------------------------------------------------------------------------
# SLOTHY optimization for x1 MVE byte ops
# -----------------------------------------------------------------------------
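# We optimize only the hot main loops, keeping prologue/epilogue intact.
# Each target reads its symbolic source from ../../armv81m_symbolic/src/ and
# writes the optimized file into this directory, going through a temporary
# file between the two SLOTHY passes.
# NOTE: FORCE is declared at the end of this Makefile but is not listed as a
# prerequisite of these targets, so they rebuild only when the symbolic source
# is newer than the output (or after `make purge`).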

# x1 state-XOR-bytes routine: run slothy-cli twice over the region delimited by
# the keccak_f1600_x1_state_xor_bytes_asm_main_loop_start / _main_loop_end_pre
# labels. Pass 1 writes to a mktemp file, pass 2 reads it and writes $@, and
# the temporary file is removed at the end.
state_xor_bytes_x1_mve_asm.S: ../../armv81m_symbolic/src/state_xor_bytes_x1_mve_asm.S
$(eval TMP := $(shell mktemp))
# Pass 1: functional constraints only, stabilize schedule
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMP) \
-s keccak_f1600_x1_state_xor_bytes_asm_main_loop_start \
-e keccak_f1600_x1_state_xor_bytes_asm_main_loop_end_pre \
-c unsafe_address_offset_fixup=False \
-c inputs_are_outputs=True \
-c constraints.functional_only=True \
-c constraints.allow_reordering=True \
-c constraints.max_displacement=0.1
# Pass 2: full optimization with splitting heuristics
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMP) -o $@ \
-s keccak_f1600_x1_state_xor_bytes_asm_main_loop_start \
-e keccak_f1600_x1_state_xor_bytes_asm_main_loop_end_pre \
-c constraints.functional_only=False \
-c constraints.allow_reordering=True \
-c variable_size=True \
-c inputs_are_outputs=True \
-c constraints.stalls_first_attempt=64 \
-c constraints.max_displacement=1.0 \
-c constraints.stalls_maximum_attempt=4096 \
-c unsafe_address_offset_fixup=False \
-c split_heuristic=True \
-c split_heuristic_stepsize=0.05 \
-c split_heuristic_factor=26 \
-c split_heuristic_repeat=2 \
-c split_heuristic_estimate_performance=False \
-c split_heuristic_optimize_seam=2
rm -f $(TMP)

# x1 state-extract-bytes routine: run slothy-cli twice over the region delimited
# by the keccak_f1600_x1_state_extract_bytes_asm_main_loop_start /
# _main_loop_end_pre labels. Pass 1 writes to a mktemp file, pass 2 reads it and
# writes $@, and the temporary file is removed at the end.
state_extract_bytes_x1_mve_asm.S: ../../armv81m_symbolic/src/state_extract_bytes_x1_mve_asm.S
$(eval TMP := $(shell mktemp))
# Pass 1: functional constraints only, stabilize schedule
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMP) \
-s keccak_f1600_x1_state_extract_bytes_asm_main_loop_start \
-e keccak_f1600_x1_state_extract_bytes_asm_main_loop_end_pre \
-c unsafe_address_offset_fixup=False \
-c inputs_are_outputs=True \
-c constraints.functional_only=True \
-c constraints.allow_reordering=True \
-c constraints.max_displacement=0.1
# Pass 2: full optimization with splitting heuristics
slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMP) -o $@ \
-s keccak_f1600_x1_state_extract_bytes_asm_main_loop_start \
-e keccak_f1600_x1_state_extract_bytes_asm_main_loop_end_pre \
-c constraints.functional_only=False \
-c constraints.allow_reordering=True \
-c variable_size=True \
-c inputs_are_outputs=True \
-c constraints.stalls_first_attempt=64 \
-c constraints.max_displacement=1.0 \
-c constraints.stalls_maximum_attempt=4096 \
-c unsafe_address_offset_fixup=False \
-c split_heuristic=True \
-c split_heuristic_stepsize=0.05 \
-c split_heuristic_factor=26 \
-c split_heuristic_repeat=2 \
-c split_heuristic_estimate_performance=False \
-c split_heuristic_optimize_seam=2
rm -f $(TMP)

# Files generated by the default `all` goal and deleted by `purge`.
# NOTE(review): keccak_f1600_x4_mve.S is not listed here, so it is only built
# when requested by name and is not removed by `purge` -- confirm this is
# intended.
ALL=state_xor_bytes_x1_mve_asm.S \
state_extract_bytes_x1_mve_asm.S
all: $(ALL)

# Remove every generated file listed in ALL.
purge:
rm -rf $(ALL)

# Phony prerequisite used to force re-optimization when invoked explicitly.
FORCE:
21 changes: 19 additions & 2 deletions dev/fips202/armv81m/src/fips202_native_armv81m.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@

#include "../../../../common.h"

/* Keccak round constants in bit-interleaved form */
/* Keccak round constants in bit-interleaved form (with terminator for x1) */
#define mlk_keccakf1600_round_constants \
MLK_NAMESPACE(keccakf1600_round_constants)
extern const uint32_t mlk_keccakf1600_round_constants[48];
extern const uint32_t mlk_keccakf1600_round_constants[49];

#define mlk_keccak_f1600_x4_mve_asm MLK_NAMESPACE(keccak_f1600_x4_mve_asm)
void mlk_keccak_f1600_x4_mve_asm(uint64_t state[100], uint64_t tmpstate[100],
Expand All @@ -32,4 +32,21 @@ void mlk_keccak_f1600_x4_state_extract_bytes_asm(void *state, uint8_t *data0,
unsigned offset,
unsigned length);

/*
 * x1 Keccak-f1600 permutation.
 * `state` is declared as 50 32-bit words and `rc` as 49 32-bit words, matching
 * the size of mlk_keccakf1600_round_constants declared in this header.
 * NOTE(review): presumably `state` holds the 25 lanes as bit-interleaved
 * 32-bit halves and `rc` is mlk_keccakf1600_round_constants -- confirm
 * against the assembly.
 */
#define mlk_keccak_f1600_x1_armv7m_asm MLK_NAMESPACE(keccak_f1600_x1_armv7m_asm)
void mlk_keccak_f1600_x1_armv7m_asm(uint32_t state[50], const uint32_t rc[49]);

/*
 * x1 XOR-bytes routine: `data` is a read-only input buffer, `state` is
 * writable.
 * NOTE(review): `offset` and `length` are presumably byte counts -- confirm
 * against the assembly.
 */
#define mlk_keccak_f1600_x1_state_xor_bytes_asm \
MLK_NAMESPACE(keccak_f1600_x1_state_xor_bytes_asm)
void mlk_keccak_f1600_x1_state_xor_bytes_asm(uint64_t *state,
const uint8_t *data,
unsigned offset, unsigned length);

/*
 * x1 extract-bytes routine.
 * `data` is the destination buffer and is therefore not const-qualified,
 * in line with the `uint8_t *` output pointers of the x4 extract routine
 * declared in this header.
 * NOTE(review): `offset` and `length` are presumably byte counts, and `state`
 * is presumably only read -- confirm against the assembly.
 */
#define mlk_keccak_f1600_x1_state_extract_bytes_asm \
  MLK_NAMESPACE(keccak_f1600_x1_state_extract_bytes_asm)
void mlk_keccak_f1600_x1_state_extract_bytes_asm(uint64_t *state,
                                                 uint8_t *data,
                                                 unsigned offset,
                                                 unsigned length);


#endif /* !MLK_DEV_FIPS202_ARMV81M_SRC_FIPS202_NATIVE_ARMV81M_H */
Loading
Loading