diff --git a/.github/workflows/ci_ec2_reusable.yml b/.github/workflows/ci_ec2_reusable.yml index c00001b5e0..ed7e55a639 100644 --- a/.github/workflows/ci_ec2_reusable.yml +++ b/.github/workflows/ci_ec2_reusable.yml @@ -207,7 +207,7 @@ jobs: nix-cache: true nix-shell: slothy script: | - autogen --slothy + autogen --slothy aarch64 tests all --opt opt # Force testing of SLOTHY-optimized Keccak variants # We can't run the examples here because some of them also specify the backend diff --git a/dev/fips202/armv81m/src/Makefile b/dev/fips202/armv81m/src/Makefile new file mode 100644 index 0000000000..ccc8c22bf3 --- /dev/null +++ b/dev/fips202/armv81m/src/Makefile @@ -0,0 +1,47 @@ +# Copyright (c) The mlkem-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +.PHONY: all purge +.DEFAULT_GOAL := all + +# ISA to optimize for (first positional argument of both slothy-cli passes) +TARGET_ISA=Arm_v81M + +# MicroArch target to optimize for (second positional argument of both slothy-cli passes) +TARGET_MICROARCH=Arm_Cortex_M55 + +keccak_f1600_x4_mve.S: ../../armv81m_symbolic/keccak_f1600_x4_mve_clean.S + $(eval TMP := $(shell mktemp)) + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMP) \ + -s keccak_f1600_x4_mve_asm_roundstart \ + -e keccak_f1600_x4_mve_asm_roundend_pre \ + -c unsafe_address_offset_fixup=False \ + -c inputs_are_outputs=True \ + -c constraints.functional_only=True \ + -c constraints.allow_reordering=True \ + -c constraints.max_displacement=0.1 + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMP) -o $@ \ + -s keccak_f1600_x4_mve_asm_roundstart \ + -e keccak_f1600_x4_mve_asm_roundend_pre \ + -c constraints.functional_only=False \ + -c constraints.allow_reordering=True \ + -c variable_size=True \ + -c inputs_are_outputs=True \ + -c constraints.stalls_first_attempt=64 \ + -c constraints.max_displacement=1.0 \ + -c constraints.stalls_maximum_attempt=4096 \ + -c unsafe_address_offset_fixup=False \ + -c split_heuristic=True \ + -c split_heuristic_stepsize=0.05 \ + -c split_heuristic_factor=26 \ + -c split_heuristic_repeat=2 \ + -c 
split_heuristic_estimate_performance=False \ + -c split_heuristic_optimize_seam=2 + rm -f $(TMP) + +ALL=keccak_f1600_x4_mve.S + +all: $(ALL) + +purge: + rm -rf $(ALL) diff --git a/dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S b/dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S new file mode 100644 index 0000000000..93fcccb3ed --- /dev/null +++ b/dev/fips202/armv81m_symbolic/keccak_f1600_x4_mve_clean.S @@ -0,0 +1,481 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) 2025 Arm Limited + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/*yaml + Name: keccak_f1600_x4_mve_asm + Description: Armv8.1-M MVE implementation of 4-way parallel Keccak-f[1600] permutation using bit-interleaved state + Signature: void mlk_keccak_f1600_x4_mve_asm(void *state, void *tmpstate, const uint32_t *rc) + ABI: + r0: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *state + description: Four bit-interleaved Keccak states (low halves followed by high halves) + r1: + type: buffer + size_bytes: 800 + permissions: read/write + c_parameter: void *tmpstate + description: Temporary storage for intermediate state + r2: + type: buffer + size_bytes: 192 + permissions: read + c_parameter: const uint32_t *rc + description: Keccak round constants in bit-interleaved form (24 pairs of 32-bit words) + Stack: + bytes: 236 + description: register preservation (44) + SIMD registers (64) + temporary storage (128) +*/ + +#include "../../../../common.h" +#if defined(MLK_FIPS202_ARMV81M_NEED_X4) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) +/* simpasm: header-end */ + +.thumb +.syntax unified +.text +.equ QSTACK0, 0 +.equ A__00, 0 +.equ A__01, 80 +.equ A__02, 160 +.equ A__03, 240 +.equ A__04, 320 +.equ A__10, 16 +.equ A__11, 96 +.equ A__12, 176 +.equ A__13, 256 +.equ A__14, 336 +.equ A__20, 32 +.equ A__21, 112 +.equ A__22, 192 +.equ A__23, 272 +.equ A__24, 352 +.equ A__30, 48 +.equ A__31, 128 +.equ A__32, 208 +.equ A__33, 288 
+.equ A__34, 368 +.equ A__40, 64 +.equ A__41, 144 +.equ A__42, 224 +.equ A__43, 304 +.equ A__44, 384 +.equ B__00, 0 +.equ B__01, 256 +.equ B__02, 112 +.equ B__03, 368 +.equ B__04, 224 +.equ B__10, 160 +.equ B__11, 16 +.equ B__12, 272 +.equ B__13, 128 +.equ B__14, 384 +.equ B__20, 320 +.equ B__21, 176 +.equ B__22, 32 +.equ B__23, 288 +.equ B__24, 144 +.equ B__30, 80 +.equ B__31, 336 +.equ B__32, 192 +.equ B__33, 48 +.equ B__34, 304 +.equ B__40, 240 +.equ B__41, 96 +.equ B__42, 352 +.equ B__43, 208 +.equ B__44, 64 +.equ RCxy_00, 0 +.equ RCxy_01, 36 +.equ RCxy_02, 3 +.equ RCxy_03, 41 +.equ RCxy_04, 18 +.equ RCxy_10, 1 +.equ RCxy_11, 44 +.equ RCxy_12, 10 +.equ RCxy_13, 45 +.equ RCxy_14, 2 +.equ RCxy_20, 62 +.equ RCxy_21, 6 +.equ RCxy_22, 43 +.equ RCxy_23, 15 +.equ RCxy_24, 61 +.equ RCxy_30, 28 +.equ RCxy_31, 55 +.equ RCxy_32, 25 +.equ RCxy_33, 21 +.equ RCxy_34, 56 +.equ RCxy_40, 27 +.equ RCxy_41, 20 +.equ RCxy_42, 39 +.equ RCxy_43, 8 +.equ RCxy_44, 14 + +qA00_h .req q0 +qA00_l .req q1 +qA20_l .req q2 + +.macro ld_xor5 state, round, x, C, A + vldrw.u32 q<\C>, [\state, #A__\x\()0] // @slothy:reads=A\state\()__\x\()0 + vldrw.u32 q<\A>, [\state, #A__\x\()1] // @slothy:reads=A\state\()__\x\()1 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()2] // @slothy:reads=A\state\()__\x\()2 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()3] // @slothy:reads=A\state\()__\x\()3 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()4] // @slothy:reads=A\state\()__\x\()4 + veor q<\C>, q<\C>, q<\A> + .endm + +.macro ld_xor5_0 state, round, x, C, A, A0 + vldrw.u32 q<\C>, [\state, #A__\x\()1] // @slothy:reads=A\state\()__\x\()1 + veor q<\C>, q<\C>, q<\A0> + vldrw.u32 q<\A>, [\state, #A__\x\()2] // @slothy:reads=A\state\()__\x\()2 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()3] // @slothy:reads=A\state\()__\x\()3 + veor q<\C>, q<\C>, q<\A> + vldrw.u32 q<\A>, [\state, #A__\x\()4] // @slothy:reads=A\state\()__\x\()4 + veor q<\C>, 
q<\C>, q<\A> + .endm + + +.macro rot1_xor_l D1_l, C0_l, C2_h + vshr.u32 q<\D1_l>, q<\C2_h>, #31 + vsli.32 q<\D1_l>, q<\C2_h>, #1 + veor q<\D1_l>, q<\D1_l>, q<\C0_l> + .endm + +.macro rot1_xor_h D1_h, C0_h, C2_l + veor q<\D1_h>, q<\C2_l>, q<\C0_h> + .endm + +.macro rot_str_e s_l, s_h, A_l, A_h, RC, x, y + vshr.u32 q, q, #32-(\RC/2) + vsli.u32 q, q, #\RC/2 + vstrw.32 q, [\s_l, #B__\x\()\y] + vshr.u32 q, q, #32-(\RC/2) + vsli.u32 q, q, #\RC/2 + vstrw.32 q, [\s_h, #B__\x\()\y] +.endm + +.macro rot_str_o s_l, s_h, A_l, A_h, RC, x, y + .if (\RC-1)/2 == 0 + vstrw.32 q, [\s_h, #B__\x\()\y] + .else + vshr.u32 q, q, #32-((\RC-1)/2) + vsli.u32 q, q, #(\RC-1)/2 + vstrw.32 q, [\s_h, #B__\x\()\y] + .endif + + .if (\RC+1)/2 == 0 + // unreachable: rot_str_o is only used with odd RC, so (RC+1)/2 is at least 1 + vstrw.32 q, [\s_l, #B__\x\()\y] + .else + vshr.u32 q, q, #32-((\RC+1)/2) + vsli.u32 q, q, #(\RC+1)/2 + vstrw.32 q, [\s_l, #B__\x\()\y] + .endif +.endm + +.macro ld_xorD_rot_str_e state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_e \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y +.endm + +.macro rot_str_e_0 s_l, s_h, A_l, A_h, RC, x, y, regl, regh + vshr.u32 q<\regl>, q, #32-(\RC/2) + vsli.u32 q<\regl>, q, #\RC/2 + //vstrw.32 q, [\s_l, #B__\x\()\y] + vshr.u32 q<\regh>, q, #32-(\RC/2) + vsli.u32 q<\regh>, q, #\RC/2 + //vstrw.32 q, [\s_h, #B__\x\()\y] +.endm + +.macro ld_xorD_rot_str_e_0 state_l, state_h, state_nl, state_nh, x, y, Dx_l, Dx_h, regl, regh + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_e_0 \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y, \regl, \regh +.endm + +.macro ld_xorD_rot_str_o state_l, state_h, state_nl, state_nh, x, y, Dx_l, 
Dx_h + vldrw.u32 q, [\state_l, #A__\x\()\y] // @slothy:reads=A\state_l\()__\x\()\y + vldrw.u32 q, [\state_h, #A__\x\()\y] // @slothy:reads=A\state_h\()__\x\()\y + veor q, q, q<\Dx_l> + veor q, q, q<\Dx_h> + rot_str_o \state_nl, \state_nh, A_l, A_h, RCxy_\x\()\y, \x, \y +.endm + +.macro ld_bic_str state, state_n, round, y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + +.macro ld_bic_str_0 state, state_n, round, y, A0 + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y + vbic q, q, q + veor q<\A0>, q, q + // A0 is stored 
later after the round-constant is added +.endm + +.macro ld_bic_str_1 state, state_n, round, y, A0, A2 + vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q<\A2>, q, q + vstrw.32 q<\A2>, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y + vbic q, q, q + veor q<\A0>, q, q + // A0 is stored later after the round-constant is added +.endm + +.macro ld_1_bic_str state, state_n, round, y, B1 + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + // vldrw.u32 q, [\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q<\B1> + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q, q + veor q, q<\B1>, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q<\B1>, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + +.macro ld_3_bic_str state, state_n, round, y, B3 + vldrw.u32 q, [\state_n, #A__0\y] // @slothy:reads=A\state_n\()__0\y + vldrw.u32 q, 
[\state_n, #A__1\y] // @slothy:reads=A\state_n\()__1\y + vldrw.u32 q, [\state_n, #A__2\y] // @slothy:reads=A\state_n\()__2\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__0\y] // @slothy:writes=A\state\()__0\y + // vldrw.u32 q, [\state_n, #A__3\y] // @slothy:reads=A\state_n\()__3\y + vbic q, q<\B3>, q + veor q, q, q + vstrw.32 q, [\state, #A__1\y] // @slothy:writes=A\state\()__1\y + vldrw.u32 q, [\state_n, #A__4\y] // @slothy:reads=A\state_n\()__4\y + vbic q, q, q<\B3> + veor q, q, q + vstrw.32 q, [\state, #A__2\y] // @slothy:writes=A\state\()__2\y + vbic q, q, q + veor q, q<\B3>, q + vstrw.32 q, [\state, #A__3\y] // @slothy:writes=A\state\()__3\y + vbic q, q, q + veor q, q, q + vstrw.32 q, [\state, #A__4\y] // @slothy:writes=A\state\()__4\y +.endm + + + +.macro keccak_4fold_round_theta_rho_pi state_l, state_h, state_nl, state_nh, rc + ld_xor5_0 \state_h, 0, 0, C0_h, A0_h, qA00_h + ld_xor5_0 \state_l, 0, 2, C2_l, A2_l, qA20_l + rot1_xor_h D1_h, C0_h, C2_l + vstrw.32 q, [r13, #QSTACK0] // @slothy:writes=stack0 + + ld_xor5_0 \state_l, 0, 0, C0_l, A0_l, qA00_l + ld_xor5 \state_h, 0, 2, C2_h, A2_h + rot1_xor_l D1_l, C0_l, C2_h + + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 1, 0, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 1, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 2, D1_l, D1_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 1, 3, D1_l, D1_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 1, 4, D1_l, D1_h + + ld_xor5 \state_h, 0, 4, C4_h, A4_h + rot1_xor_l D3_l, C2_l, C4_h + + ld_xor5 \state_l, 0, 4, C4_l, A4_l + rot1_xor_h D3_h, C2_h, C4_l + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 3, 0, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 1, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 2, D3_l, D3_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 3, 3, D3_l, 
D3_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 3, 4, D3_l, D3_h + + ld_xor5 \state_h, 0, 1, C1_h, A1_h + rot1_xor_l D0_l, C4_l, C1_h + ld_xor5 \state_l, 0, 1, C1_l, A1_l + rot1_xor_h D0_h, C4_h, C1_l + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 0, D0_l, D0_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 1, D0_l, D0_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 0, 2, D0_l, D0_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 0, 3, D0_l, D0_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 0, 4, D0_l, D0_h + + ld_xor5 \state_l, 0, 3, C3_l, A3_l + rot1_xor_h D2_h, C1_h, C3_l + ld_xor5 \state_h, 0, 3, C3_h, A3_h + rot1_xor_l D2_l, C1_l, C3_h + + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 2, 0, D2_l, D2_h + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 2, 1, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 2, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 3, D2_l, D2_h + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 2, 4, D2_l, D2_h + + rot1_xor_h D4_h, C3_h, C0_l + vldrw.32 q, [r13, #QSTACK0] // @slothy:reads=stack0 + rot1_xor_l D4_l, C3_l, C0_h + + + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 4, 0, D4_l, D4_h // B40 = A03 + ld_xorD_rot_str_o \state_l, \state_h, \state_nl, \state_nh, 4, 2, D4_l, D4_h // B42 = A24 + ld_xorD_rot_str_e \state_l, \state_h, \state_nl, \state_nh, 4, 4, D4_l, D4_h // B44 = A40 + // A11_l, A11_h, A32_l are held in registers for the next step + ld_xorD_rot_str_e_0 \state_l, \state_h, \state_nl, \state_nh, 4, 3, D4_l, D4_h, A32_l, A32_h // B43 = A32 + vstrw.32 q, [\state_nh, #B__43] + ld_xorD_rot_str_e_0 \state_l, \state_h, \state_nl, \state_nh, 4, 1, D4_l, D4_h, A11_l, A11_h // B41 = A11 +.endm + +.macro keccak_4fold_round_chi_iota state_l, state_h, state_nl, state_nh, rc // now BIC + // A11_l, A11_h, A32_l are held in 
registers from the previous step + ld_1_bic_str \state_l, \state_nl, 0, 1, A11_l + ld_1_bic_str \state_h, \state_nh, 0, 1, A11_h + + ld_3_bic_str \state_l, \state_nl, 0, 2, A32_l + ld_bic_str \state_h, \state_nh, 0, 2 + + ld_bic_str \state_l, \state_nl, 0, 3 + ld_bic_str \state_h, \state_nh, 0, 3 + + ld_bic_str \state_l, \state_nl, 0, 4 + ld_bic_str \state_h, \state_nh, 0, 4 + + ld_bic_str_1 \state_l, \state_nl, 0, 0, A00_l, qA20_l + ld_bic_str_0 \state_h, \state_nh, 0, 0, A00_h + + + ldrd r, r, [\rc] + vdup.32 q, r + veor qA00_l, q, q + vstrw.32 qA00_l, [\state_l, #A__00] // @slothy:writes=A\state_l\()__00 + vdup.32 q, r + veor qA00_h, q, q + vstrw.32 qA00_h, [\state_h, #A__00] // @slothy:writes=A\state_h\()__00 +.endm + +.text +.balign 8 +.type MLK_ASM_NAMESPACE(keccak_f1600_x4_mve_asm), %function +.global MLK_ASM_NAMESPACE(keccak_f1600_x4_mve_asm) +MLK_ASM_FN_SYMBOL(keccak_f1600_x4_mve_asm) + + push {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + vpush {d8-d15} + sub sp, #8*16 + + mov r6, r2 // r6 = rc table pointer (from r2 parameter) + + // r0: state 0 + // r1: state 1 + // r2: this state low (reused from rc parameter) + // r3: this state high + // r4: next state low + // r5: next state high + // r6: rc table + + + mov lr, #24 + + mov r2, r0 + mov r4, r1 + + // pre-fetch so we can keep in registers between rounds + add r3, r2, #400 + vldrw.u32 qA00_h, [r3, #A__00] + vldrw.u32 qA00_l, [r2, #A__00] + vldrw.u32 qA20_l, [r2, #A__20] + + wls lr, lr, keccak_f1600_x4_mve_asm_roundend +keccak_f1600_x4_mve_asm_roundstart: + add r3, r2, #400 + add r5, r4, #400 + keccak_4fold_round_theta_rho_pi r2, r3, r4, r5, r6 + keccak_4fold_round_chi_iota r2, r3, r4, r5, r6 + + add r6, r6, #8 +keccak_f1600_x4_mve_asm_roundend_pre: + le lr, keccak_f1600_x4_mve_asm_roundstart +keccak_f1600_x4_mve_asm_roundend: + add sp, #8*16 + + vpop {d8-d15} + ldmia.w sp!, {r3,r4,r5,r6,r7,r8,r9,r10,r11,r12, pc} + +/****************** REGISTER DEALLOCATIONS *******************/ + .unreq qA00_h + .unreq qA00_l 
+ .unreq qA20_l + +/* simpasm: footer-start */ +#endif /* MLK_FIPS202_ARMV81M_NEED_X4 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/nix/slothy/default.nix b/nix/slothy/default.nix index d657a3d7ea..a14aa78204 100644 --- a/nix/slothy/default.nix +++ b/nix/slothy/default.nix @@ -17,12 +17,12 @@ let in stdenvNoCC.mkDerivation rec { pname = "slothy-cli"; - version = "915c224166207ce07b31152194305c3b6687d09b"; + version = "0.1.10"; src = fetchFromGitHub { owner = "slothy-optimizer"; repo = "slothy"; rev = version; - sha256 = "sha256-ebZjm+nhmML/+DZF78eN1ezoxbv5Rrc5kWsh0Ycww4U="; + sha256 = "sha256-zGZEH+lixwVeH+f03pC6etO2xCF1vRH/RnmyYaKf4kI="; }; nativeBuildInputs = [ pkgs.makeWrapper ]; diff --git a/scripts/autogen b/scripts/autogen index 7a611b7911..3487d6aa29 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -3550,7 +3550,9 @@ def gen_slothy(funcs): for t in targets: - if t.startswith("keccak"): + if t.startswith("keccak") and "mve" in t: + base = "dev/fips202/armv81m/src" + elif t.startswith("keccak"): base = "dev/fips202/aarch64/src" else: base = "dev/aarch64_opt/src" @@ -4160,7 +4162,7 @@ def gen_test_configs(): def _main(): - slothy_choices = [ + slothy_aarch64_targets = [ "ntt", "intt", "poly_tobytes_asm", @@ -4175,6 +4177,11 @@ def _main(): "keccak_f1600_x4_v8a_scalar_hybrid_asm", "keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm", ] + slothy_armv81m_targets = [ + "keccak_f1600_x4_mve", + ] + slothy_all_targets = slothy_aarch64_targets + slothy_armv81m_targets + slothy_choices = slothy_all_targets + ["aarch64", "armv81m"] parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter @@ -4200,7 +4207,18 @@ def _main(): os.chdir(os.path.join(os.path.dirname(__file__), "..")) if args.slothy == []: - args.slothy = slothy_choices + args.slothy = slothy_all_targets + elif args.slothy is not None: + # Expand group names to their targets + expanded = [] + for t in args.slothy: + if t == "aarch64": + expanded.extend(slothy_aarch64_targets) 
+ elif t == "armv81m": + expanded.extend(slothy_armv81m_targets) + else: + expanded.append(t) + args.slothy = expanded def sync_backends(): synchronize_backends(