From d0a737c4c5c0fd2b610af4f2fc1055f9b200c2ff Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Wed, 28 Jan 2026 16:30:46 +0800 Subject: [PATCH] Armv8.1-M: Add clean x4 Keccak code and SLOTHY Makefile Add the clean (non-optimized) Keccak x4 assembly and Makefile to enable regeneration of the SLOTHY-optimized assembly. Extend autogen --slothy to support new function, and also add --slothy armv81m and --slothy aarch64 group options. Update CI to only run aarch64 SLOTHY targets. Optimization of Armv8.1-M assembly is not currently tested in CI, but has been tested locally. Update SLOTHY to version 0.1.10 as the code requires disabling the address offset fixup which is not supported for Armv8.1-M in the older versions of SLOTHY. Co-Authored-By: Brendan Moran Signed-off-by: Matthias J. Kannwischer --- .github/workflows/ci_ec2_reusable.yml | 2 +- dev/fips202/armv81m/src/Makefile | 47 ++ .../keccak_f1600_x4_mve_clean.S | 481 ++++++++++++++++++ nix/slothy/default.nix | 4 +- scripts/autogen | 24 +- 5 files changed, 552 insertions(+), 6 deletions(-) create mode 100644 dev/fips202/armv81m/src/Makefile create mode 100644 dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S diff --git a/.github/workflows/ci_ec2_reusable.yml b/.github/workflows/ci_ec2_reusable.yml index c00001b5e0..ed7e55a639 100644 --- a/.github/workflows/ci_ec2_reusable.yml +++ b/.github/workflows/ci_ec2_reusable.yml @@ -207,7 +207,7 @@ jobs: nix-cache: true nix-shell: slothy script: | - autogen --slothy + autogen --slothy aarch64 tests all --opt opt # Force testing of SLOTHY-optimized Keccak variants # We can't run the examples here because some of them also specify the backend diff --git a/dev/fips202/armv81m/src/Makefile b/dev/fips202/armv81m/src/Makefile new file mode 100644 index 0000000000..ccc8c22bf3 --- /dev/null +++ b/dev/fips202/armv81m/src/Makefile @@ -0,0 +1,47 @@ +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +.PHONY: all 
purge +.DEFAULT_GOAL := all + +# ISA to optimize for +TARGET_ISA=Arm_v81M + +# MicroArch target to optimize for +TARGET_MICROARCH=Arm_Cortex_M55 + +keccak_f1600_x4_mve.S: ../../armv81m_symbolic/keccak_f1600_x4_mve_clean.S + $(eval TMP := $(shell mktemp)) + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMP) \ + -s keccak_f1600_x4_mve_asm_roundstart \ + -e keccak_f1600_x4_mve_asm_roundend_pre \ + -c unsafe_address_offset_fixup=False \ + -c inputs_are_outputs=True \ + -c constraints.functional_only=True \ + -c constraints.allow_reordering=True \ + -c constraints.max_displacement=0.1 + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMP) -o $@ \ + -s keccak_f1600_x4_mve_asm_roundstart \ + -e keccak_f1600_x4_mve_asm_roundend_pre \ + -c constraints.functional_only=False \ + -c constraints.allow_reordering=True \ + -c variable_size=True \ + -c inputs_are_outputs=True \ + -c constraints.stalls_first_attempt=64 \ + -c constraints.max_displacement=1.0 \ + -c constraints.stalls_maximum_attempt=4096 \ + -c unsafe_address_offset_fixup=False \ + -c split_heuristic=True \ + -c split_heuristic_stepsize=0.05 \ + -c split_heuristic_factor=26 \ + -c split_heuristic_repeat=2 \ + -c split_heuristic_estimate_performance=False \ + -c split_heuristic_optimize_seam=2 + rm -f $(TMP) + +ALL=keccak_f1600_x4_mve.S + +all: $(ALL) + +purge: + rm -rf $(ALL) diff --git a/dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S b/dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S new file mode 100644 index 0000000000..93fcccb3ed --- /dev/null +++ b/dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S @@ -0,0 +1,481 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) 2025 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: keccak_f1600_x4_mve_asm + Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Signature: void mlk_keccak_f1600_x4_mve_asm(void *state, void 
*tmpstate, const uint32_t *rc) + ABI: + r0: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *state + description: Four bit-interleaved Keccak states (low halves followed by high halves) + r1: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *tmpstate + description: Temporary storage for intermediate state + r2: + type: buffer + size_bytes: 192 + permissions: read + c_parameter: const uint32_t *rc + description: Keccak round constants in bit-interleaved form (24 pairs of 32-bit words) + Stack: + bytes: 236 + description: register preservation (44) + SIMD registers (64) + temporary storage (128) +*/ + +#include "../../../../common.h" +#if defined(MLK_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text +.equ QSTACK0, 0 +.equ A__00, 0 +.equ A__01, 80 +.equ A__02, 160 +.equ A__03, 240 +.equ A__04, 320 +.equ A__10, 16 +.equ A__11, 96 +.equ A__12, 176 +.equ A__13, 256 +.equ A__14, 336 +.equ A__20, 32 +.equ A__21, 112 +.equ A__22, 192 +.equ A__23, 272 +.equ A__24, 352 +.equ A__30, 48 +.equ A__31, 128 +.equ A__32, 208 +.equ A__33, 288 +.equ A__34, 368 +.equ A__40, 64 +.equ A__41, 144 +.equ A__42, 224 +.equ A__43, 304 +.equ A__44, 384 +.equ B__00, 0 +.equ B__01, 256 +.equ B__02, 112 +.equ B__03, 368 +.equ B__04, 224 +.equ B__10, 160 +.equ B__11, 16 +.equ B__12, 272 +.equ B__13, 128 +.equ B__14, 384 +.equ B__20, 320 +.equ B__21, 176 +.equ B__22, 32 +.equ B__23, 288 +.equ B__24, 144 +.equ B__30, 80 +.equ B__31, 336 +.equ B__32, 192 +.equ B__33, 48 +.equ B__34, 304 +.equ B__40, 240 +.equ B__41, 96 +.equ B__42, 352 +.equ B__43, 208 +.equ B__44, 64 +.equ RCxy_00, 0 +.equ RCxy_01, 36 +.equ RCxy_02, 3 +.equ RCxy_03, 41 +.equ RCxy_04, 18 +.equ RCxy_10, 1 +.equ RCxy_11, 44 +.equ RCxy_12, 10 +.equ RCxy_13, 45 +.equ RCxy_14, 2 +.equ RCxy_20, 62 +.equ RCxy_21, 6 +.equ RCxy_22, 43 +.equ RCxy_23, 15 +.equ RCxy_24, 61 +.equ RCxy_30, 28 +.equ 
RCxy_31, 55 +.equ RCxy_32, 25 +.equ RCxy_33, 21 +.equ RCxy_34, 56 +.equ RCxy_40, 27 +.equ RCxy_41, 20 +.equ RCxy_42, 39 +.equ RCxy_43, 8 +.equ RCxy_44, 14 + +qA00_h .req q0 +qA00_l .req q1 +qA20_l .req q2 + +.macro ld_xor5 state, round, x, C, A + vldrw.u32 q<\C>, [\state, #A__\x\()0] // @slothy:reads=A\state\()__\x\()0 + vldrw.u32 q<\A>, [\state, #A__\x\()1] // @slothy:reads=A\state\()__\x\()1 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()2] // @slothy:reads=A\state\()__\x\()2 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()3] // @slothy:reads=A\state\()__\x\()3 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()4] // @slothy:reads=A\state\()__\x\()4 + veor q<\C>, q<\C>, q<\A> + .endm + +.macro ld_xor5_0 state, round, x, C, A, A0 + vldrw.u32 q<\C>, [\state, #A__\x\()1] // @slothy:reads=A\state\()__\x\()1 + veor q<\C>, q<\C>, q<\A0> + vldrw.u32 q<\A>, [\state, #A__\x\()2] // @slothy:reads=A\state\()__\x\()2 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()3] // @slothy:reads=A\state\()__\x\()3 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()4] // @slothy:reads=A\state\()__\x\()4 + veor q<\C>, q<\C>, q<\A> + .endm + + +.macro rot1_xor_l D1_l, C0_l, C2_h + vshr.u32 q<\D1_l>, q<\C2_h>, #31 + vsli.32 q<\D1_l>, q<\C2_h>, #1 + veor q<\D1_l>, q<\D1_l>, q<\C0_l> + .endm + +.macro rot1_xor_h D1_h, C0_h, C2_l + veor q<\D1_h>, q<\C2_l>, q<\C0_h> + .endm + +.macro rot_str_e s_l, s_h, A_l, A_h, RC, x, y + vshr.u32 q, q, #32-(\RC/2) + vsli.u32 q, q, #\RC/2 + vstrw.32 q, [\s_l, #B__\x\()\y] + vshr.u32 q, q, #32-(\RC/2) + vsli.u32 q, q, #\RC/2 + vstrw.32 q, [\s_h, #B__\x\()\y] +.endm + +.macro rot_str_o s_l, s_h, A_l, A_h, RC, x, y + .if (\RC-1)/2 == 0 + vstrw.32 q, [\s_h, #B__\x\()\y] + .else + vshr.u32 q, q, #32-((\RC-1)/2) + vsli.u32 q, q, #(\RC-1)/2 + vstrw.32 q, [\s_h, #B__\x\()\y] + .endif + + .if (\RC+1)/2 == 0 + // should never happen + vstrw.32 q, [\s_l, #B__\x\()\y] + .else + vshr.u32 q, q, 
#32-((\RC+1)/2) + vsli.u32 q, q, #(\RC+1)/2 + vstrw.32 q, [\s_l, #B__\x\()\y] + .endif +.endm + +.macro ld_xorD_rot_str_e state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_e \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y +.endm + +.macro rot_str_e_0 s_l, s_h, A_l, A_h, RC, x, y, regl, regh + vshr.u32 q<\regl>, q, #32-(\RC/2) + vsli.u32 q<\regl>, q, #\RC/2 + //vstrw.32 q, [\s_l, #B__\x\()\y] + vshr.u32 q<\regh>, q, #32-(\RC/2) + vsli.u32 q<\regh>, q, #\RC/2 + //vstrw.32 q, [\s_h, #B__\x\()\y] +.endm + +.macro ld_xorD_rot_str_e_0 state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h, regl, regh + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_e_0 \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y, \regl, \regh +.endm + +.macro ld_xorD_rot_str_o state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_o \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y +.endm + +.macro ld_bic_str state, state_n, round, y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, 
[\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + +.macro ld_bic_str_0 state, state_n round, y, A0 + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y + vbic q, q, q + veor q<\A0>, q, q + // A0 is stored later after the round-constant is added +.endm + +.macro ld_bic_str_1 state, state_n, round, y, A0, A2 + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q<\A2>, q, q + vstrw.32 q<\A2>, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, 
#A__4\y] // @slothy:writes=A\state\()__4\y + vbic q, q, q + veor q<\A0>, q, q + // A0 is stored later after the round-constant is added +.endm + +.macro ld_1_bic_str state, state_n, round, y, B1 + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + // vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q<\B1> + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q<\B1>, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q<\B1>, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + +.macro ld_3_bic_str state, state_n, round, y, B3 + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + // vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q<\B3>, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q<\B3> + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q<\B3>, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + + + +.macro 
keccak_4fold_round_theta_rho_pi state_l, state_h, state_nl, state_nh, rc + ld_xor5_0 \state_h, 0, 0, C0_h, A0_h, qA00_h + ld_xor5_0 \state_l, 0, 2, C2_l, A2_l, qA20_l + rot1_xor_h D1_h, C0_h, C2_l + vstrw.32 q, [r13, #QSTACK0] // @slothy:writes=stack0 + + ld_xor5_0 \state_l, 0, 0, C0_l, A0_l, qA00_l + ld_xor5 \state_h, 0, 2, C2_h, A2_h + rot1_xor_l D1_l, C0_l, C2_h + + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 1, 0, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 1, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 2, D1_l, D1_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 1, 3, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 4, D1_l, D1_h + + ld_xor5 \state_h, 0, 4, C4_h, A4_h + rot1_xor_l D3_l, C2_l, C4_h + + ld_xor5 \state_l, 0, 4, C4_l, A4_l + rot1_xor_h D3_h, C2_h, C4_l + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 3, 0, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 1, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 2, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 3, D3_l, D3_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 3, 4, D3_l, D3_h + + ld_xor5 \state_h, 0, 1, C1_h, A1_h + rot1_xor_l D0_l, C4_l, C1_h + ld_xor5 \state_l, 0, 1, C1_l, A1_l + rot1_xor_h D0_h, C4_h, C1_l + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 0, D0_l, D0_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 1, D0_l, D0_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 0, 2, D0_l, D0_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 0, 3, D0_l, D0_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 4, D0_l, D0_h + + ld_xor5 \state_l, 0, 3, C3_l, A3_l + rot1_xor_h D2_h, C1_h, C3_l + ld_xor5 \state_h, 0, 3, C3_h, A3_h + rot1_xor_l D2_l, C1_l, C3_h + + ld_xorD_rot_str_e 
\state_l, \state_h, \state_nl, \state_nh, 2, 0, D2_l, D2_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 2, 1, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 2, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 3, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 4, D2_l, D2_h + + rot1_xor_h D4_h, C3_h, C0_l + vldrw.32 q, [r13, #QSTACK0] // @slothy:reads=stack0 + rot1_xor_l D4_l, C3_l, C0_h + + + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 4, 0, D4_l, D4_h // B40 = A03 + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 4, 2, D4_l, D4_h // B42 = A24 + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 4, 4, D4_l, D4_h // B44 = A40 + // A11_l, A11_h, A32_l are held in registers from the next step + ld_xorD_rot_str_e_0 \state_l, \state_h, \state_nl, \state_nh, 4, 3, D4_l, D4_h, A32_l, A32_h // B43 = A32 + vstrw.32 q, [\state_nh, #B__43] + ld_xorD_rot_str_e_0 \state_l, \state_h, \state_nl, \state_nh, 4, 1, D4_l, D4_h, A11_l, A11_h // B41 = A11 +.endm + +.macro keccak_4fold_round_chi_iota state_l, state_h, state_nl, state_nh, rc // now BIC + // A11_l, A11_h, A32_l are held in registers from the previous step + ld_1_bic_str \state_l, \state_nl, 0, 1, A11_l + ld_1_bic_str \state_h, \state_nh, 0, 1, A11_h + + ld_3_bic_str \state_l, \state_nl, 0, 2, A32_l + ld_bic_str \state_h, \state_nh, 0, 2 + + ld_bic_str \state_l, \state_nl, 0, 3 + ld_bic_str \state_h, \state_nh, 0, 3 + + ld_bic_str \state_l, \state_nl, 0, 4 + ld_bic_str \state_h, \state_nh, 0, 4 + + ld_bic_str_1 \state_l, \state_nl, 0, 0, A00_l, qA20_l + ld_bic_str_0 \state_h, \state_nh, 0, 0, A00_h + + + ldrd r, r, [\rc] + vdup.32 q, r + veor qA00_l, q, q + vstrw.32 qA00_l, [\state_l, #A__00] // @slothy:writes=A\state_l\()__00 + vdup.32 q, r + veor qA00_h, q, q + vstrw.32 qA00_h, [\state_h, #A__00] // @slothy:writes=A\state_h\()__00 +.endm + +.text +.balign 8 +.type 
MLK_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function +.global MLK_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) +MLK_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) + + push {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + sub sp, #8*16 + + mov r6, r2 // r6 = rc table pointer (from r2 parameter) + + // r0: state 0 + // r1: state 1 + // r2: this state low (reused from rc parameter) + // r3: this state high + // r4: next state low + // r5: next state high + // r6: rc table + + + mov lr, #24 + + mov r2, r0 + mov r4, r1 + + // pre-fetch so we can keep in registers between rounds + add r3, r2, #400 + vldrw.u32 qA00_h, [r3, #A__00] + vldrw.u32 qA00_l, [r2, #A__00] + vldrw.u32 qA20_l, [r2, #A__20] + + wls lr, lr, keccak_f1600_x4_mve_asm_roundend +keccak_f1600_x4_mve_asm_roundstart: + add r3, r2, #400 + add r5, r4, #400 + keccak_4fold_round_theta_rho_pi r2, r3, r4, r5, r6 + keccak_4fold_round_chi_iota r2, r3, r4, r5, r6 + + add r6, r6, #8 +keccak_f1600_x4_mve_asm_roundend_pre: + le lr, keccak_f1600_x4_mve_asm_roundstart +keccak_f1600_x4_mve_asm_roundend: + add sp, #8*16 + + vpop {d8-d15} + ldmia.w sp!, {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12, pc} + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq qA00_h + .unreq qA00_l + .unreq qA20_l + +/* simpasm: footer-start */ +#endif /* MLK_FIPS202_ARMV81M_NEED_X4 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/nix/slothy/default.nix b/nix/slothy/default.nix index d657a3d7ea..a14aa78204 100644 --- a/nix/slothy/default.nix +++ b/nix/slothy/default.nix @@ -17,12 +17,12 @@ let in stdenvNoCC.mkDerivation rec { pname = "slothy-cli"; - version = "915c224166207ce07b31152194305c3b6687d09b"; + version = "0.1.10"; src = fetchFromGitHub { owner = "slothy-optimizer"; repo = "slothy"; rev = version; - sha256 = "sha256-ebZjm+nhmML/+DZF78eN1ezoxbv5Rrc5kWsh0Ycww4U="; + sha256 = "sha256-zGZEH+lixwVeH+f03pC6etO2xCF1vRH/RnmyYaKf4kI="; }; nativeBuildInputs = [ pkgs.makeWrapper ]; diff --git a/scripts/autogen b/scripts/autogen index 
7a611b7911..3487d6aa29 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -3550,7 +3550,9 @@ def gen_slothy(funcs): for t in targets: - if t.startswith("keccak"): + if t.startswith("keccak") and "mve" in t: + base = "dev/fips202/armv81m/src" + elif t.startswith("keccak"): base = "dev/fips202/aarch64/src" else: base = "dev/aarch64_opt/src" @@ -4160,7 +4162,7 @@ def gen_test_configs(): def _main(): - slothy_choices = [ + slothy_aarch64_targets = [ "ntt", "intt", "poly_tobytes_asm", @@ -4175,6 +4177,11 @@ def _main(): "keccak_f1600_x4_v8a_scalar_hybrid_asm", "keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm", ] + slothy_armv81m_targets = [ + "keccak_f1600_x4_mve", + ] + slothy_all_targets = slothy_aarch64_targets + slothy_armv81m_targets + slothy_choices = slothy_all_targets + ["aarch64", "armv81m"] parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter @@ -4200,7 +4207,18 @@ def _main(): os.chdir(os.path.join(os.path.dirname(__file__), "..")) if args.slothy == []: - args.slothy = slothy_choices + args.slothy = slothy_all_targets + elif args.slothy is not None: + # Expand group names to their targets + expanded = [] + for t in args.slothy: + if t == "aarch64": + expanded.extend(slothy_aarch64_targets) + elif t == "armv81m": + expanded.extend(slothy_armv81m_targets) + else: + expanded.append(t) + args.slothy = expanded def sync_backends(): synchronize_backends(