diff --git a/.github/actions/multi-functest/action.yml b/.github/actions/multi-functest/action.yml index 3e3837a7f6..468f2925ab 100644 --- a/.github/actions/multi-functest/action.yml +++ b/.github/actions/multi-functest/action.yml @@ -147,7 +147,7 @@ runs: rng_fail: ${{ inputs.rng_fail }} extra_args: ${{ inputs.extra_args }} extra_env: ${{ inputs.extra_env }} - - name: Cross ppc64le Tests + - name: Cross ppc64le Tests (POWER8) if: ${{ (inputs.compile_mode == 'all' || inputs.compile_mode == 'cross-ppc64le') && (success() || failure()) }} uses: ./.github/actions/functest with: @@ -156,10 +156,37 @@ runs: nix-verbose: ${{ inputs.nix-verbose }} gh_token: ${{ inputs.gh_token }} custom_shell: ${{ inputs.custom_shell }} - cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE" + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mcpu=power8" ldflags: ${{ inputs.ldflags }} cross_prefix: powerpc64le-unknown-linux-gnu- - exec_wrapper: qemu-ppc64le + exec_wrapper: "qemu-ppc64le -cpu power8" + opt: ${{ inputs.opt }} + func: ${{ inputs.func }} + kat: ${{ inputs.kat }} + unit: ${{ inputs.unit }} + acvp: ${{ inputs.acvp }} + wycheproof: ${{ inputs.wycheproof }} + examples: ${{ inputs.examples }} + check_namespace: ${{ inputs.check_namespace }} + stack: ${{ inputs.stack }} + alloc: ${{ inputs.alloc }} + rng_fail: ${{ inputs.rng_fail }} + extra_args: ${{ inputs.extra_args }} + extra_env: ${{ inputs.extra_env }} + # Same target built with -mcpu=power7: intended to exercise the MLK_NATIVE_FUNC_FALLBACK branches that dev/ppc64le/meta.h takes when __POWER8_VECTOR__ is undefined. + - name: Cross ppc64le Tests (POWER7 fallback path) + if: ${{ (inputs.compile_mode == 'all' || inputs.compile_mode == 'cross-ppc64le') && (success() || failure()) }} + uses: ./.github/actions/functest + with: + nix-shell: ${{ inputs.nix-shell }} + nix-cache: ${{ inputs.nix-cache }} + nix-verbose: ${{ inputs.nix-verbose }} + gh_token: ${{ inputs.gh_token }} + custom_shell: ${{ inputs.custom_shell }} + cflags: "${{ inputs.cflags }} -DMLK_FORCE_PPC64LE -mcpu=power7" + ldflags: ${{ inputs.ldflags }} + cross_prefix: powerpc64le-unknown-linux-gnu- + exec_wrapper: "qemu-ppc64le -cpu power8" opt: ${{ inputs.opt }} func: 
${{ inputs.func }} kat: ${{ inputs.kat }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5beafb34c8..5f09473c59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,8 +154,8 @@ jobs: check_namespace: 'false' - name: build + test (cross, opt) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on riscv32 or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} @@ -164,8 +164,8 @@ jobs: opt: 'opt' - name: build + test (cross, opt, +debug) uses: ./.github/actions/multi-functest - # There is no native code yet on PPC64LE, riscv32 or AArch64_be, so no point running opt tests - if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'ppc64le' && matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} + # There is no native code yet on riscv32 or AArch64_be, so no point running opt tests + if: ${{ matrix.target.mode != 'native' && (matrix.target.arch != 'riscv32' && matrix.target.arch != 'aarch64_be') }} with: nix-shell: ${{ matrix.target.nix_shell }} nix-cache: ${{ matrix.target.mode == 'native' && 'false' || 'true' }} diff --git a/dev/ppc64le/README.md b/dev/ppc64le/README.md new file mode 100644 index 0000000000..def9e7e7ef --- /dev/null +++ b/dev/ppc64le/README.md @@ -0,0 +1,7 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little endian POWER8 (ppc64le) and later systems, +that is, Power systems supporting ISA 2.07 and above. 
+ diff --git a/dev/ppc64le/meta.h b/dev/ppc64le/meta.h new file mode 100644 index 0000000000..baa95db2d4 --- /dev/null +++ b/dev/ppc64le/meta.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_META_H +#define MLK_DEV_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded. */ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +/* Native hooks for the primitives enabled above. When the compiler targets + * POWER8 vector instructions (__POWER8_VECTOR__), each hook passes `data` and + * the constant table mlk_ppc_qdata to its assembly routine and returns + * MLK_NATIVE_FUNC_SUCCESS. Otherwise it leaves `data` untouched and returns + * MLK_NATIVE_FUNC_FALLBACK. */ + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_ntt_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_intt_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_reduce_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} +#endif 
/* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_META_H */ diff --git a/dev/ppc64le/src/arith_native_ppc64le.h b/dev/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..5cf1c3b0f4 --- /dev/null +++ b/dev/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include <stdint.h> +#include "../../../common.h" +#include "consts.h" + +/* Assembly routines of this backend. Each takes a pointer to int16_t + * coefficients as its first argument and a pointer to the constant table + * declared in consts.h (mlk_ppc_qdata) as its second. */ + +#define mlk_ntt_ppc_asm MLK_NAMESPACE(ntt_ppc_asm) +void mlk_ntt_ppc_asm(int16_t *, const int16_t *); + +#define mlk_intt_ppc_asm MLK_NAMESPACE(intt_ppc_asm) +void mlk_intt_ppc_asm(int16_t *, const int16_t *); + +#define mlk_reduce_ppc_asm MLK_NAMESPACE(reduce_ppc_asm) +void mlk_reduce_ppc_asm(int16_t *r, const int16_t *); + +#define mlk_poly_tomont_ppc_asm MLK_NAMESPACE(poly_tomont_ppc_asm) +void mlk_poly_tomont_ppc_asm(int16_t *, const int16_t *); + +#endif /* !MLK_DEV_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/dev/ppc64le/src/consts.c b/dev/ppc64le/src/consts.c new file mode 100644 index 0000000000..4065b60231 --- /dev/null +++ b/dev/ppc64le/src/consts.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) + +#include "consts.h" + +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ +MLK_ALIGN MLK_INTERNAL_DATA_DEFINITION const int16_t mlk_ppc_qdata[2072] = { + /* -Q */ + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + /* QINV */ + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, 
+ -3327, + -3327, + /* Q */ + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* N^-1 in Montgomery form: pow(128,-1,MLKEM_Q) * 2^16 mod MLKEM_Q = 512. + * Multiplying by this via Barrett-fqmul scales INTT output by N^-1 and + * leaves it in Montgomery form (mlk_poly_invntt_tomont contract). */ + 512, + 512, + 512, + 512, + 512, + 512, + 512, + 512, + /* check-magic: 5040 == round((512 * 2**16 + MLKEM_Q) / MLKEM_Q) // 2 */ + /* Barrett twist of N^-1*R = round_to_even(N_INV_MONT * 2^16 / MLKEM_Q) / 2 + */ + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" +/* zetas for invNTT */ +#include "consts_intt.inc" +/* twisted zetas for NTT (Barrett high-mul) */ +#include "consts_ntt_tw.inc" +/* twisted zetas for invNTT (Barrett high-mul) */ +#include "consts_intt_tw.inc" +}; +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/consts.h b/dev/ppc64le/src/consts.h new file mode 100644 index 0000000000..851cd3392e --- /dev/null +++ b/dev/ppc64le/src/consts.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_DEV_PPC64LE_SRC_CONSTS_H +#define MLK_DEV_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +/* Offsets into the constant table */ +/* check-magic: off */ +#define MLK_PPC_NQ_OFFSET 0 +#define MLK_PPC_QINV_OFFSET 16 +#define MLK_PPC_Q_OFFSET 32 +#define MLK_PPC_C20159_OFFSET 48 +#define MLK_PPC_N_INV_OFFSET 64 +#define MLK_PPC_N_INV_TW_OFFSET 80 +#define MLK_PPC_C1353_OFFSET 96 +#define MLK_PPC_ZETA_NTT_OFFSET 112 +#define MLK_PPC_ZETA_INTT_OFFSET 1120 +#define 
MLK_PPC_ZETA_NTT_TW_OFFSET 2128 +#define MLK_PPC_ZETA_INTT_TW_OFFSET 3136 +/* check-magic: on */ + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ +MLK_INTERNAL_DATA_DECLARATION const int16_t mlk_ppc_qdata[2072]; +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_DEV_PPC64LE_SRC_CONSTS_H */ diff --git a/dev/ppc64le/src/consts_intt.inc b/dev/ppc64le/src/consts_intt.inc new file mode 100644 index 0000000000..057b1df249 --- /dev/null +++ b/dev/ppc64le/src/consts_intt.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. + */ + -394, -394, -1175, -1175, -1219, -1219, 885, 885, + 1212, 1212, 1029, 1029, -1607, -1607, -1455, -1455, + -1179, -1179, 886, 886, 1143, 1143, -554, -554, + 1092, 1092, 1026, 1026, -525, -525, 403, 403, + 561, 561, -735, -735, -1230, -1230, -863, -863, + 319, 319, 757, 757, 1063, 1063, -556, -556, + -780, -780, 1645, 1645, 375, 375, -1239, -1239, + -1031, -1031, -109, -109, 1584, 1584, -1292, -1292, + -992, -992, 641, 641, 733, 733, 268, 268, + -1021, -1021, -941, -941, 939, 939, -892, -892, + 952, 952, -642, -642, -1482, -1482, 1461, 1461, + 1651, 1651, -1540, -1540, -1626, -1626, -540, -540, + -1173, -1173, -279, -279, 756, 756, -314, -314, + -667, -667, 233, 233, 1409, 1409, -48, -48, + 723, 723, 1100, 1100, 1637, 1637, -1041, -1041, + -568, -568, -680, -680, 17, 17, 583, 583, + 1227, 1227, 1227, 1227, 910, 910, 910, 910, + -855, -855, -855, -855, -219, -219, -219, -219, + 1481, 1481, 1481, 1481, 648, 648, 648, 648, + -682, -682, -682, -682, -712, -712, -712, -712, + 1534, 1534, 1534, 1534, -927, -927, -927, 
-927, + 1438, 1438, 1438, 1438, -461, -461, -461, -461, + 807, 807, 807, 807, 452, 452, 452, 452, + -1010, -1010, -1010, -1010, 1435, 1435, 1435, 1435, + 1320, 1320, 1320, 1320, -1414, -1414, -1414, -1414, + -464, -464, -464, -464, 33, 33, 33, 33, + -816, -816, -816, -816, 632, 632, 632, 632, + 650, 650, 650, 650, -1352, -1352, -1352, -1352, + -1052, -1052, -1052, -1052, -1274, -1274, -1274, -1274, + 1197, 1197, 1197, 1197, -1025, -1025, -1025, -1025, + -76, -76, -76, -76, -1573, -1573, -1573, -1573, + 289, 289, 289, 289, 331, 331, 331, 331, + 821, 821, 821, 821, 821, 821, 821, 821, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + -450, -450, -450, -450, -450, -450, -450, -450, + -936, -936, -936, -936, -936, -936, -936, -936, + -447, -447, -447, -447, -447, -447, -447, -447, + 535, 535, 535, 535, 535, 535, 535, 535, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 56, 56, 56, 56, 56, 56, 56, 56, + -283, -283, -283, -283, -283, -283, -283, -283, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + -882, -882, -882, -882, -882, -882, -882, -882, + 296, 296, 296, 296, 296, 296, 296, 296, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 569, 569, 569, 569, 569, 569, 569, 569, + -69, -69, -69, -69, -69, -69, -69, -69, + -543, -543, -543, -543, -543, -543, -543, -543, + 797, 797, 797, 797, 797, 797, 797, 797, + 193, 193, 193, 193, 193, 193, 193, 193, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + 848, 848, 848, 848, 848, 848, 848, 848, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 630, 630, 630, 630, 630, 630, 630, 630, + -687, -687, -687, -687, -687, -687, -687, -687, + -40, -40, -40, -40, -40, -40, -40, -40, + -749, -749, -749, -749, -749, -749, -749, -749, 
+ -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, diff --git a/dev/ppc64le/src/consts_intt_tw.inc b/dev/ppc64le/src/consts_intt_tw.inc new file mode 100644 index 0000000000..783e0af8f2 --- /dev/null +++ b/dev/ppc64le/src/consts_intt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. + */ + -3878, -3878, -11566, -11566, -11999, -11999, 8711, 8711, + 11930, 11930, 10129, 10129, -15818, -15818, -14322, -14322, + -11605, -11605, 8721, 8721, 11251, 11251, -5453, -5453, + 10749, 10749, 10099, 10099, -5168, -5168, 3967, 3967, + 5522, 5522, -7235, -7235, -12107, -12107, -8495, -8495, + 3140, 3140, 7451, 7451, 10463, 10463, -5473, -5473, + -7678, -7678, 16192, 16192, 3691, 3691, -12196, -12196, + -10148, -10148, -1073, -1073, 15592, 15592, -12717, -12717, + -9764, -9764, 6309, 6309, 7215, 7215, 2638, 2638, + -10050, -10050, -9262, -9262, 9243, 9243, -8780, -8780, + 9371, 9371, -6319, -6319, -14588, -14588, 14381, 14381, + 16251, 16251, -15159, -15159, -16005, -16005, -5315, -5315, + -11546, -11546, -2746, -2746, 7441, 7441, -3091, -3091, + -6565, -6565, 2293, 2293, 13869, 13869, -472, -472, + 7117, 7117, 10828, 10828, 16113, 16113, -10247, -10247, + -5591, -5591, -6693, -6693, 167, 167, 5739, 5739, + 12078, 12078, 12078, 12078, 8957, 8957, 8957, 8957, + -8416, -8416, -8416, -8416, -2156, -2156, -2156, -2156, + 14578, 14578, 14578, 14578, 6378, 6378, 6378, 6378, + -6713, -6713, -6713, -6713, -7008, -7008, -7008, -7008, + 15099, 15099, 15099, 15099, -9125, -9125, -9125, -9125, + 14155, 14155, 14155, 14155, -4538, -4538, -4538, -4538, + 7943, 7943, 7943, 7943, 4449, 4449, 4449, 4449, + -9942, -9942, -9942, -9942, 14125, 14125, 14125, 14125, + 12993, 12993, 
12993, 12993, -13918, -13918, -13918, -13918, + -4567, -4567, -4567, -4567, 325, 325, 325, 325, + -8032, -8032, -8032, -8032, 6221, 6221, 6221, 6221, + 6398, 6398, 6398, 6398, -13308, -13308, -13308, -13308, + -10355, -10355, -10355, -10355, -12540, -12540, -12540, -12540, + 11782, 11782, 11782, 11782, -10089, -10089, -10089, -10089, + -748, -748, -748, -748, -15483, -15483, -15483, -15483, + 2845, 2845, 2845, 2845, 3258, 3258, 3258, 3258, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, -4429, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 551, 551, 551, 551, 551, 551, 551, 551, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -679, -679, -679, -679, -679, -679, -679, -679, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -6762, -6762, -6762, -6762, 
-6762, -6762, -6762, -6762, + -394, -394, -394, -394, -394, -394, -394, -394, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, diff --git a/dev/ppc64le/src/consts_ntt.inc b/dev/ppc64le/src/consts_ntt.inc new file mode 100644 index 0000000000..e53bf13713 --- /dev/null +++ b/dev/ppc64le/src/consts_ntt.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, + -749, -749, -749, -749, -749, -749, -749, -749, + -40, -40, -40, -40, -40, -40, -40, -40, + -687, -687, -687, -687, -687, -687, -687, -687, + 630, 630, 630, 630, 630, 630, 630, 630, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 848, 848, 848, 848, 848, 848, 848, 848, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 193, 193, 193, 193, 193, 193, 193, 193, + 797, 797, 797, 797, 797, 797, 797, 797, + -543, -543, -543, -543, -543, -543, -543, -543, + -69, -69, -69, -69, -69, -69, -69, -69, + 569, 569, 569, 569, 569, 569, 569, 569, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 296, 296, 296, 296, 296, 296, 296, 296, + -882, -882, -882, -882, -882, -882, -882, -882, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + -283, -283, -283, -283, -283, -283, -283, -283, + 56, 56, 56, 56, 56, 56, 56, 56, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 535, 535, 535, 535, 535, 535, 535, 535, + 
-447, -447, -447, -447, -447, -447, -447, -447, + -936, -936, -936, -936, -936, -936, -936, -936, + -450, -450, -450, -450, -450, -450, -450, -450, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + 821, 821, 821, 821, 821, 821, 821, 821, + 331, 331, 331, 331, 289, 289, 289, 289, + -1573, -1573, -1573, -1573, -76, -76, -76, -76, + -1025, -1025, -1025, -1025, 1197, 1197, 1197, 1197, + -1274, -1274, -1274, -1274, -1052, -1052, -1052, -1052, + -1352, -1352, -1352, -1352, 650, 650, 650, 650, + 632, 632, 632, 632, -816, -816, -816, -816, + 33, 33, 33, 33, -464, -464, -464, -464, + -1414, -1414, -1414, -1414, 1320, 1320, 1320, 1320, + 1435, 1435, 1435, 1435, -1010, -1010, -1010, -1010, + 452, 452, 452, 452, 807, 807, 807, 807, + -461, -461, -461, -461, 1438, 1438, 1438, 1438, + -927, -927, -927, -927, 1534, 1534, 1534, 1534, + -712, -712, -712, -712, -682, -682, -682, -682, + 648, 648, 648, 648, 1481, 1481, 1481, 1481, + -219, -219, -219, -219, -855, -855, -855, -855, + 910, 910, 910, 910, 1227, 1227, 1227, 1227, + 583, 583, 17, 17, -680, -680, -568, -568, + -1041, -1041, 1637, 1637, 1100, 1100, 723, 723, + -48, -48, 1409, 1409, 233, 233, -667, -667, + -314, -314, 756, 756, -279, -279, -1173, -1173, + -540, -540, -1626, -1626, -1540, -1540, 1651, 1651, + 1461, 1461, -1482, -1482, -642, -642, 952, 952, + -892, -892, 939, 939, -941, -941, -1021, -1021, + 268, 268, 733, 733, 641, 641, -992, -992, + -1292, -1292, 1584, 1584, -109, -109, -1031, -1031, + -1239, -1239, 375, 375, 1645, 1645, -780, -780, + -556, -556, 1063, 1063, 757, 757, 319, 319, + -863, -863, -1230, -1230, -735, -735, 561, 561, + 403, 403, -525, -525, 1026, 1026, 1092, 1092, + -554, -554, 1143, 1143, 886, 886, -1179, -1179, + -1455, -1455, -1607, -1607, 1029, 1029, 1212, 1212, + 885, 885, -1219, -1219, -1175, -1175, -394, -394, diff --git a/dev/ppc64le/src/consts_ntt_tw.inc b/dev/ppc64le/src/consts_ntt_tw.inc new file mode 100644 index 0000000000..a191b3bf2f --- /dev/null +++ 
b/dev/ppc64le/src/consts_ntt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -394, -394, -394, -394, -394, -394, -394, -394, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + -679, -679, -679, -679, -679, -679, -679, -679, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 551, 551, 551, 551, 551, 551, 551, 551, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4429, -4429, 
-4429, -4429, -4429, -4429, -4429, -4429, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + 3258, 3258, 3258, 3258, 2845, 2845, 2845, 2845, + -15483, -15483, -15483, -15483, -748, -748, -748, -748, + -10089, -10089, -10089, -10089, 11782, 11782, 11782, 11782, + -12540, -12540, -12540, -12540, -10355, -10355, -10355, -10355, + -13308, -13308, -13308, -13308, 6398, 6398, 6398, 6398, + 6221, 6221, 6221, 6221, -8032, -8032, -8032, -8032, + 325, 325, 325, 325, -4567, -4567, -4567, -4567, + -13918, -13918, -13918, -13918, 12993, 12993, 12993, 12993, + 14125, 14125, 14125, 14125, -9942, -9942, -9942, -9942, + 4449, 4449, 4449, 4449, 7943, 7943, 7943, 7943, + -4538, -4538, -4538, -4538, 14155, 14155, 14155, 14155, + -9125, -9125, -9125, -9125, 15099, 15099, 15099, 15099, + -7008, -7008, -7008, -7008, -6713, -6713, -6713, -6713, + 6378, 6378, 6378, 6378, 14578, 14578, 14578, 14578, + -2156, -2156, -2156, -2156, -8416, -8416, -8416, -8416, + 8957, 8957, 8957, 8957, 12078, 12078, 12078, 12078, + 5739, 5739, 167, 167, -6693, -6693, -5591, -5591, + -10247, -10247, 16113, 16113, 10828, 10828, 7117, 7117, + -472, -472, 13869, 13869, 2293, 2293, -6565, -6565, + -3091, -3091, 7441, 7441, -2746, -2746, -11546, -11546, + -5315, -5315, -16005, -16005, -15159, -15159, 16251, 16251, + 14381, 14381, -14588, -14588, -6319, -6319, 9371, 9371, + -8780, -8780, 9243, 9243, -9262, -9262, -10050, -10050, + 2638, 2638, 7215, 7215, 6309, 6309, -9764, -9764, + -12717, -12717, 15592, 15592, -1073, -1073, -10148, -10148, + -12196, -12196, 3691, 3691, 16192, 16192, -7678, -7678, + -5473, -5473, 10463, 10463, 7451, 7451, 3140, 3140, + -8495, -8495, -12107, -12107, -7235, -7235, 5522, 5522, + 3967, 3967, -5168, -5168, 10099, 10099, 10749, 10749, + -5453, -5453, 11251, 11251, 8721, 8721, -11605, -11605, + -14322, -14322, -15818, -15818, 10129, 10129, 11930, 11930, + 8711, 8711, -11999, -11999, -11566, -11566, -3878, -3878, diff --git 
a/dev/ppc64le/src/intt_ppc_asm.S b/dev/ppc64le/src/intt_ppc_asm.S new file mode 100644 index 0000000000..f0eaa12de5 --- /dev/null +++ b/dev/ppc64le/src/intt_ppc_asm.S @@ -0,0 +1,844 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) +/* simpasm: header-end */ + +#include "consts.h" + +.text + +/* Barrett-Q-reduce constants */ +#define V20159 0 +#define V2pw25 1 +#define V_26 2 +#define V_MKQ 3 + +/* Shared zero vector (aliases V_MKQ; V_MKQ is only live during + * barrett_reduce_4x, where V_ZERO/zero is reloaded from vs3 anyway). */ +#define V_ZERO 3 + +/* Barrett-multiply constants */ +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V_NINV 10 +#define V_NINV_TW 11 + +/* Barrett twisted zetas: zt = round_to_even(z * 2^16 / q) / 2. + * Placed in vdata_b slots (free by the time Load_next_4zetas runs) to + * avoid the V20159/V2pw25/V_26/V_MKQ/V_Z0..3 constants required by the + * next Barrett-Q-reduce. V_ZETATW aliases V_ZT0 (broadcast layer 5-7). 
*/ +#define V_ZT0 12 +#define V_ZT1 20 +#define V_ZT2 6 +#define V_ZT3 11 +#define V_ZETATW 12 + +#define vdata_a1 21 +#define vdata_a2 22 +#define vdata_a3 23 +#define vdata_a4 24 +#define vdata_b1 8 +#define vdata_b2 12 +#define vdata_b3 16 +#define vdata_b4 20 + +#define vdata_brt1 8 +#define vdata_brt2 12 +#define vdata_brt3 16 +#define vdata_brt4 20 + +#define vdata_t1 25 +#define vdata_t2 26 +#define vdata_t3 30 +#define vdata_t4 31 + +#define vresult_brt1 4 +#define vresult_brt2 9 +#define vresult_brt3 13 +#define vresult_brt4 17 +#define vresult_t1 13 +#define vresult_t2 18 +#define vresult_t3 23 +#define vresult_t4 28 + +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define zeta_tw_inp 22 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) 
+ ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +/* + * Compute sum and difference of the loaded r[j] and r[j+len] vectors + * r[j] + r[j+len] : V8, V12, V16, V20 (data for Barrett Q-reduce) + * r[j+len] - r[j]: V25, V26, V30, V31 (data for Barrett fqmul) + */ +.macro Compute_4Coeffs + vsubuhm vdata_t1, vdata_b1, vdata_a1 + vsubuhm vdata_t2, vdata_b2, vdata_a2 + vsubuhm vdata_t3, vdata_b3, vdata_a3 + vsubuhm vdata_t4, vdata_b4, vdata_a4 + vadduhm vdata_brt1, vdata_b1, vdata_a1 + vadduhm vdata_brt2, vdata_b2, vdata_a2 + vadduhm vdata_brt3, vdata_b3, vdata_a3 + vadduhm vdata_brt4, vdata_b4, vdata_a4 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coefficient array. + * + * start: byte offset of the first r[j] vector in the coefficient array. + * next: byte distance added to reach each following vector. + * len: index difference between paired coefficients (not a macro argument). + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * registers used for offsets to coefficients, r[j] and r[j+len] + * R9: offset to r0 = start + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + next + * R19: offset to r'2 = r'1 + next + * R21: offset to r'3 = r'2 + next + * + */ +.macro Init_Coeffs_offset start, next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* j + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next +.endm + +/* + * Load coefficient vectors for r[j] (r) and r[j+len] (r'): + * Load the r' vectors from offsets R10, R17, R19 and R21 + * Load the r vectors from offsets R9, R16, R18 and R20 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + */ +.macro Load_4Rjp + lxvd2x 32+vdata_b1, rinp, b1_offset /* V8: 
vector r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V12: vector r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V16: vector r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V20: vector r'3 */ + + lxvd2x 32+vdata_a1, rinp, a1_offset /* V21: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V22: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V23: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V24: vector r3 */ +.endm + +/* + * Load coefficients and set up vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start, next + Init_Coeffs_offset \start, \next + Load_4Rjp + Compute_4Coeffs +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load coefficients and set up vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, rjlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow pair transposes the vectors into, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, rjlen7, rjlen14, rjlen15 + * + * r[j+len]: V8, V12, V16, V20 + * r[j]: V21, V22, V23, V24 + * + * For the coefficient computation, the zeta vector is arranged + * in the matching order for the multiplication. 
+ */ +.macro Load_L24Coeffs + lxvd2x 32+25, 0, dup_rinp + lxvd2x 32+26, 10, dup_rinp + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, the zeta vector is arranged + * in the proper order to match the multiplication. + */ +.macro Load_L44Coeffs + lxvd2x 10, 0, dup_rinp + lxvd2x 11, 10, dup_rinp + xxpermdi 32+vdata_b1, 11, 10, 3 + xxpermdi 32+vdata_a1, 11, 10, 0 + lxvd2x 10, 11, dup_rinp + lxvd2x 11, 12, dup_rinp + xxpermdi 32+vdata_b2, 11, 10, 3 + xxpermdi 32+vdata_a2, 11, 10, 0 + lxvd2x 10, 15, dup_rinp + lxvd2x 11, 16, dup_rinp + xxpermdi 32+vdata_b3, 11, 10, 3 + xxpermdi 32+vdata_a3, 11, 10, 0 + lxvd2x 10, 17, dup_rinp + lxvd2x 11, 18, dup_rinp + xxpermdi 32+vdata_b4, 11, 10, 3 + xxpermdi 32+vdata_a4, 11, 10, 0 +.endm + +.macro barrett_reduce_4x _v0, _v1, _v2, _v3 + /* Restore constant vectors + V_MKQ, V2pw25 and V_26 */ + vxor 7, 7, 7 /* v7 = 0 */ + xxlor 32+3, 6, 6 /* copy from vs6 */ + xxlor 32+1, 7, 7 /* copy from vs7 */ + xxlor 32+2, 8, 8 /* copy from vs8 */ + /* Multiply Odd/Even signed halfword; + Results word bound by 2^32 in abs value.
*/ + vmulosh 6, vdata_brt1, V20159 + vmulesh 5, vdata_brt1, V20159 + vmulosh 11, vdata_brt2, V20159 + vmulesh 10, vdata_brt2, V20159 + vmulosh 15, vdata_brt3, V20159 + vmulesh 14, vdata_brt3, V20159 + vmulosh 19, vdata_brt4, V20159 + vmulesh 18, vdata_brt4, V20159 + xxmrglw 32+4, 32+5, 32+6 + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V2pw25 + vadduwm 5, 5, V2pw25 + vadduwm 9, 9, V2pw25 + vadduwm 10, 10, V2pw25 + vadduwm 13, 13, V2pw25 + vadduwm 14, 14, V2pw25 + vadduwm 17, 17, V2pw25 + vadduwm 18, 18, V2pw25 + /* Right shift and pack lower halfword, + results bound by 2^16 in abs value */ + vsraw 4, 4, V_26 + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + /* Modulo multiply-Low unsigned halfword; + results bound by 2^16 * q in abs value. */ + vmladduhm \_v0, 4, V_MKQ, 8 + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +/* + * Barrett multiplication for the per-layer fqmul (4 lanes). + * For each lane (b in vdata_t_i, z in _vz_i, zt in _vzt_i): + * t = vmhraddshs(b, zt, 0) = round(b*zt / 2^15) + * b_lo = vmladduhm(b, z, 0) = (b*z) mod 2^16 + * vo = vmladduhm(t, -q, b_lo) = b*z - t*q (mod 2^16) + * + * Yields the signed canonical representative of (b*z) mod q, + * bounded by q/2. V_ZERO is the zero vector set at function entry. 
+ */ +.macro barrett_fqmul_4x _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3, _vo0, _vo1, _vo2, _vo3 + vmhraddshs 14, vdata_t1, \_vzt0, V_ZERO + vmhraddshs 19, vdata_t2, \_vzt1, V_ZERO + vmhraddshs 24, vdata_t3, \_vzt2, V_ZERO + vmhraddshs 29, vdata_t4, \_vzt3, V_ZERO + + vmladduhm \_vo0, vdata_t1, \_vz0, V_ZERO + vmladduhm \_vo1, vdata_t2, \_vz1, V_ZERO + vmladduhm \_vo2, vdata_t3, \_vz2, V_ZERO + vmladduhm \_vo3, vdata_t4, \_vz3, V_ZERO + + vmladduhm \_vo0, 14, V_NMKQ, \_vo0 + vmladduhm \_vo1, 19, V_NMKQ, \_vo1 + vmladduhm \_vo2, 24, V_NMKQ, \_vo2 + vmladduhm \_vo3, 29, V_NMKQ, \_vo3 +.endm + +.macro Load_next_4zetas + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 8, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + lxvd2x 32+V_ZT0, 0, zeta_tw_inp + lxvd2x 32+V_ZT1, 8, zeta_tw_inp + lxvd2x 32+V_ZT2, 11, zeta_tw_inp + lxvd2x 32+V_ZT3, 12, zeta_tw_inp + addi zeta_inp, zeta_inp, 64 + addi zeta_tw_inp, zeta_tw_inp, 64 +.endm + +.macro Write_B4C _vs0, _vs1, _vs2, _vs3 + stxvd2x \_vs0, rinp, a1_offset + stxvd2x \_vs1, rinp, a2_offset + stxvd2x \_vs2, rinp, a3_offset + stxvd2x \_vs3, rinp, a4_offset +.endm + +.macro Write_M4C _vs0, _vs1, _vs2, _vs3 + stxvd2x \_vs0, rinp, b1_offset + stxvd2x \_vs1, rinp, b2_offset + stxvd2x \_vs2, rinp, b3_offset + stxvd2x \_vs3, rinp, b4_offset +.endm + +.macro Reload_4coeffs + lxvd2x 32+vdata_t1, 0, rinp + lxvd2x 32+vdata_t2, 10, rinp + lxvd2x 32+vdata_t3, 11, rinp + lxvd2x 32+vdata_t4, 12, rinp + addi rinp, rinp, 64 +.endm + +.macro MWrite_8X _vs0, _vs1, _vs2, _vs3, _vs4, _vs5, _vs6, _vs7 + addi rinp, rinp, -128 + stxvd2x \_vs0, 0, rinp + stxvd2x \_vs1, 10, rinp + stxvd2x \_vs2, 11, rinp + stxvd2x \_vs3, 12, rinp + stxvd2x \_vs4, 15, rinp + stxvd2x \_vs5, 16, rinp + stxvd2x \_vs6, 17, rinp + stxvd2x \_vs7, 18, rinp + addi rinp, rinp, 128 +.endm + +/* + * Transpose the final coefficients of 4-4 layout to the original + * coefficient array order. 
+ */ +.macro PermWriteL44 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + xxpermdi 32+10, 32+14, 32+vresult_t1, 3 + xxpermdi 32+11, 32+14, 32+vresult_t1, 0 + xxpermdi 32+12, 32+19, 32+vresult_t2, 3 + xxpermdi 32+13, 32+19, 32+vresult_t2, 0 + xxpermdi 32+14, 32+24, 32+vresult_t3, 3 + xxpermdi 32+15, 32+24, 32+vresult_t3, 0 + xxpermdi 32+16, 32+29, 32+vresult_t4, 3 + xxpermdi 32+17, 32+29, 32+vresult_t4, 0 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the original + * coefficient array order. + */ +.macro PermWriteL24 + xxlor 32+14, 10, 10 + xxlor 32+19, 11, 11 + xxlor 32+24, 12, 12 + xxlor 32+29, 13, 13 + vmrgew 10, vresult_t1, 14 + vmrgow 11, vresult_t1, 14 + vmrgew 12, vresult_t2, 19 + vmrgow 13, vresult_t2, 19 + vmrgew 14, vresult_t3, 24 + vmrgow 15, vresult_t3, 24 + vmrgew 16, vresult_t4, 29 + vmrgow 17, vresult_t4, 29 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp +.endm + +/* + * INTT layer 1, Len=2. 
+ */ +.macro intt_layer1 + Load_L24Coeffs + Compute_4Coeffs + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ + Load_next_4zetas + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3, vresult_t1, vresult_t2, vresult_t3, vresult_t4 + PermWriteL24 +.endm + +/* + * INTT layer 2, Len=4. + */ +.macro intt_layer2 + Load_L44Coeffs + Compute_4Coeffs + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + xxlor 10, 32+vresult_brt1, 32+vresult_brt1 + xxlor 11, 32+vresult_brt2, 32+vresult_brt2 + xxlor 12, 32+vresult_brt3, 32+vresult_brt3 + xxlor 13, 32+vresult_brt4, 32+vresult_brt4 + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ + Load_next_4zetas + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3, vresult_t1, vresult_t2, vresult_t3, vresult_t4 + PermWriteL44 +.endm + +/* + * INTT layer 3 and 4, Len=8 and 16. + */ +.macro intt_layer34 start, next + Load_4Coeffs \start, \next + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ + Load_next_4zetas + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3, vresult_t1, vresult_t2, vresult_t3, vresult_t4 + Write_M4C 32+vresult_t1, 32+vresult_t2, 32+vresult_t3, 32+vresult_t4 +.endm + +/* + * INTT layer 5, 6 and 7, Len=32, 64 and 128. 
+ */ +.macro intt_layer567 start, next + Load_4Coeffs \start, \next + barrett_reduce_4x vresult_brt1, vresult_brt2, vresult_brt3, vresult_brt4 + Write_B4C 32+vresult_brt1, 32+vresult_brt2, 32+vresult_brt3, 32+vresult_brt4 + xxlor 32+V_NMKQ, 0, 0 /* restore V_NMKQ */ + xxlor 32+V_ZERO, 3, 3 /* restore V_ZERO from vs3 */ + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + barrett_fqmul_4x V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW, vresult_t1, vresult_t2, vresult_t3, vresult_t4 + Write_M4C 32+vresult_t1, 32+vresult_t2, 32+vresult_t3, 32+vresult_t4 +.endm + +/* + * mlk_intt_ppc_asm(int16_t *r, int16_t *qdata) + * Compute inverse NTT based on the following 7 layers - + * len = 2, 4, 8, 16, 32, 64, 128 + * + * Each layer computes the coefficients on 2 legs, start and start + len*2 offsets. + * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * The corresponding vectors of leg1 and leg2 are added and subtracted; the sum is + * Barrett-reduced and the difference is Barrett-multiplied by zeta. + * + * -> leg1 = reduce(leg1+leg2), leg2 = (leg2-leg1) * zeta + * + * The resulting coefficients are then stored back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coefficients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coefficients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to be transposed back to its original 4-4 or 2-2-2-2 layout.
+ */ +.global MLK_ASM_NAMESPACE(intt_ppc_asm) +.balign 16 +MLK_ASM_FN_SYMBOL(intt_ppc_asm) + + SAVE_REGS + + /* init vectors and constants */ + lxvx 0, 0, qinp /* -Q */ + + xxlxor 32+V_ZERO, 32+V_ZERO, 32+V_ZERO + xxlor 3, 32+V_ZERO, 32+V_ZERO /* save zero vector to vs3 */ + + /* Setup for Barrett reduce */ + li 10, MLK_PPC_Q_OFFSET + li 11, MLK_PPC_C20159_OFFSET + lxvx 6, 10, qinp /* V_MKQ */ + lxvx 32+V20159, 11, qinp /* V20159 */ + + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 32+8, 32+8 /* V_26 store at vs8 */ + + vspltisw 9, 1 + vsubuwm 10, 8, 9 /* value 25 */ + vslw 9, 9, 10 + xxlor 7, 32+9, 32+9 /* V2pw25 store at vs7 */ + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + + /* + * Scale every coefficient by N^-1 via Barrett multiplication. + */ + addi zeta_inp, qinp, MLK_PPC_N_INV_OFFSET + lvx V_NINV, 0, zeta_inp + addi zeta_inp, qinp, MLK_PPC_N_INV_TW_OFFSET + lvx V_NINV_TW, 0, zeta_inp + li 8, 4 + mtctr 8 + + xxlor 32+V_NMKQ, 0, 0 /* V_NMKQ = -Q */ +intt_ppc_asm_Loopf: + Reload_4coeffs + barrett_fqmul_4x V_NINV, V_NINV, V_NINV, V_NINV, V_NINV_TW, V_NINV_TW, V_NINV_TW, V_NINV_TW, 6, 7, 8, 9 + Reload_4coeffs + barrett_fqmul_4x V_NINV, V_NINV, V_NINV, V_NINV, V_NINV_TW, V_NINV_TW, V_NINV_TW, V_NINV_TW, 13, 18, 23, 28 + MWrite_8X 32+6, 32+7, 32+8, 32+9, 32+13, 32+18, 32+23, 32+28 + bdnz intt_ppc_asm_Loopf + + addi rinp, rinp, -512 + +.balign 16 + /* + * Layer 1. len = 2 + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Update zetas vectors, each vector has 2 zetas + * Load zeta vectors in 2-2-2-2 layout + */ + addi zeta_inp, qinp, MLK_PPC_ZETA_INTT_OFFSET + addi zeta_tw_inp, qinp, MLK_PPC_ZETA_INTT_TW_OFFSET + li len_2, 4 /* len * 2 */ + mr dup_rinp, rinp + + intt_layer1 + addi dup_rinp, dup_rinp, 128 + intt_layer1 + addi dup_rinp, dup_rinp, 128 + intt_layer1 + addi dup_rinp, dup_rinp, 128 + intt_layer1 + addi dup_rinp, dup_rinp, 128 + +.balign 16 + /* + * Layer 2. 
len = 4 + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Load zeta vectors in 4-4 layout + */ + mr dup_rinp, rinp + li len_2, 8 + + intt_layer2 + addi dup_rinp, dup_rinp, 128 + intt_layer2 + addi dup_rinp, dup_rinp, 128 + intt_layer2 + addi dup_rinp, dup_rinp, 128 + intt_layer2 + addi dup_rinp, dup_rinp, 128 + +.balign 16 + /* + * Layer 3. len = 8, start = 0, 128, 256, 384 + */ + li len_2, 16 + + intt_layer34 0, 32 + intt_layer34 128, 32 + intt_layer34 256, 32 + intt_layer34 384, 32 + +.balign 16 + /* + * Layer 4. len = 16, start = 0, 16, 256, 272 + */ + li len_2, 32 + + intt_layer34 0, 64 + + addi zeta_inp, zeta_inp, -64 + addi zeta_tw_inp, zeta_tw_inp, -64 + intt_layer34 16, 64 + + intt_layer34 256, 64 + + addi zeta_inp, zeta_inp, -64 + addi zeta_tw_inp, zeta_tw_inp, -64 + intt_layer34 272, 64 + +.balign 16 + /* + * Layer 5. len = 32, start = 0, 128, 256, 384 + */ + li len_2, 64 + + intt_layer567 0, 16 + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + intt_layer567 128, 16 + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + intt_layer567 256, 16 + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + intt_layer567 384, 16 + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + +.balign 16 + /* + * Layer 6. len = 64, start = 0, 64, 256, 320 + */ + li len_2, 128 + + intt_layer567 0, 16 + intt_layer567 64, 16 + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + intt_layer567 256, 16 + intt_layer567 320, 16 + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + +.balign 16 + /* + * Layer 7. len = 128, start = 0, 64, 128, 192 + */ + li len_2, 256 /* len*2 */ + + intt_layer567 0, 16 + intt_layer567 64, 16 + intt_layer567 128, 16 + intt_layer567 192, 16 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. 
*/ +#undef V20159 +#undef V2pw25 +#undef V_26 +#undef V_MKQ +#undef V_ZERO +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V_NINV +#undef V_NINV_TW +#undef V_ZT0 +#undef V_ZT1 +#undef V_ZT2 +#undef V_ZT3 +#undef V_ZETATW +#undef vdata_a1 +#undef vdata_a2 +#undef vdata_a3 +#undef vdata_a4 +#undef vdata_b1 +#undef vdata_b2 +#undef vdata_b3 +#undef vdata_b4 +#undef vdata_brt1 +#undef vdata_brt2 +#undef vdata_brt3 +#undef vdata_brt4 +#undef vdata_t1 +#undef vdata_t2 +#undef vdata_t3 +#undef vdata_t4 +#undef vresult_brt1 +#undef vresult_brt2 +#undef vresult_brt3 +#undef vresult_brt4 +#undef vresult_t1 +#undef vresult_t2 +#undef vresult_t3 +#undef vresult_t4 +#undef rinp +#undef dup_rinp +#undef qinp +#undef len_2 +#undef zeta_inp +#undef zeta_tw_inp +#undef a1_offset +#undef a2_offset +#undef a3_offset +#undef a4_offset +#undef b1_offset +#undef b2_offset +#undef b3_offset +#undef b4_offset + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/ntt_ppc_asm.S b/dev/ppc64le/src/ntt_ppc_asm.S new file mode 100644 index 0000000000..c28783881c --- /dev/null +++ b/dev/ppc64le/src/ntt_ppc_asm.S @@ -0,0 +1,666 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 
2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) +/* simpasm: header-end */ + +#include "consts.h" + +#define V_ZERO 3 +#define V_NMKQ 5 +#define V_Z0 7 +#define V_Z1 8 +#define V_Z2 9 +#define V_Z3 10 +#define V_ZETA 10 +#define V_ZT0 0 +#define V_ZT1 1 +#define V_ZT2 6 +#define V_ZT3 11 +#define V_ZETATW 2 + +#define vdata_a1 12 +#define vdata_a2 17 +#define vdata_a3 22 +#define vdata_a4 27 +#define vdata_b1 13 +#define vdata_b2 18 +#define vdata_b3 23 +#define vdata_b4 28 + +#define vresult_a1 15 +#define vresult_b1 16 +#define vresult_a2 20 +#define vresult_b2 21 +#define vresult_a3 25 +#define vresult_b3 26 +#define vresult_a4 30 +#define vresult_b4 31 + +#define rinp 3 +#define dup_rinp 5 +#define qinp 4 +#define len_2 7 +#define zeta_inp 14 +#define zeta_tw_inp 22 +#define a1_offset 9 +#define a2_offset 16 +#define a3_offset 18 +#define a4_offset 20 +#define b1_offset 10 +#define b2_offset 17 +#define b3_offset 19 +#define b4_offset 21 + +.text + +.macro SAVE_REGS + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvx 32+20, 10, 1 + stxvx 32+21, 11, 1 + stxvx 32+22, 12, 1 + stxvx 32+23, 14, 1 + stxvx 32+24, 15, 1 + stxvx 32+25, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvx 32+26, 10, 1 + stxvx 32+27, 11, 1 + stxvx 32+28, 12, 1 + stxvx 32+29, 14, 1 + stxvx 32+30, 15, 1 + stxvx 32+31, 16, 1 +.endm + +.macro RESTORE_REGS + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvx 32+20, 10, 1 + lxvx 32+21, 11, 1 + lxvx 32+22, 12, 1 + lxvx 32+23, 14, 1 + lxvx 32+24, 15, 1 + lxvx 32+25, 
16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvx 32+26, 10, 1 + lxvx 32+27, 11, 1 + lxvx 32+28, 12, 1 + lxvx 32+29, 14, 1 + lxvx 32+30, 15, 1 + lxvx 32+31, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + + mtlr 0 + addi 1, 1, 352 +.endm + +/* + * Init_Coeffs_offset: initial offset setup for the coefficient array. + * + * start: beginning of the offset to the coefficient array. + * next: byte offset between consecutive vectors of the same leg. + * len: Index difference between coefficients. + * + * r7: len * 2, each coefficient component is 2 bytes. + * + * registers used for offset to coefficients, r[j] and r[j+len] + * R9: offset to r0 = j + * R16: offset to r1 = r0 + next + * R18: offset to r2 = r1 + next + * R20: offset to r3 = r2 + next + * + * R10: offset to r'0 = r0 + len*2 + * R17: offset to r'1 = r'0 + next + * R19: offset to r'2 = r'1 + next + * R21: offset to r'3 = r'2 + next + * + */ +.macro Init_Coeffs_offset start, next + li a1_offset, \start /* first offset to j */ + add b1_offset, len_2, a1_offset /* J + len*2 */ + addi a2_offset, a1_offset, \next + addi b2_offset, b1_offset, \next + addi a3_offset, a2_offset, \next + addi b3_offset, b2_offset, \next + addi a4_offset, a3_offset, \next + addi b4_offset, b3_offset, \next +.endm + +/* + * Load coefficient in r[j+len] (r') vectors from offset, R10, R17, R19 and R21 + * r[j+len]: V13, V18, V23, V28 + */ +.macro Load_4Rjp + lxvd2x 32+vdata_b1, rinp, b1_offset /* V13: vector r'0 */ + lxvd2x 32+vdata_b2, rinp, b2_offset /* V18: vector for r'1 */ + lxvd2x 32+vdata_b3, rinp, b3_offset /* V23: vector for r'2 */ + lxvd2x 32+vdata_b4, rinp, b4_offset /* V28: vector for r'3 */ +.endm + +/* + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rjlen0, rjlen1, rjlen2, rjlen3, rjlen4, rjlen5, rjlen6, rjlen7 + */ +.macro Load_4Coeffs start, next + Init_Coeffs_offset \start, \next +
Load_4Rjp +.endm + +/* + * Load 2 - 2 - 2 - 2 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rjlen2, rjlen3, rj4, rj5, rjlen6, rjlen7 + * rj8, rj9, rjlen10, rjlen11, rj12, rj13, rjlen14, rjlen15 + * Each vmrgew and vmrgow will transpose vectors as, + * r[j]= rj0, rj1, rj8, rj9, rj4, rj5, rj12, rj13 + * r[j+len]= rjlen2, rjlen3, rjlen10, rjlen11, rjlen6, rjlen7, rjlen14, rjlen15 + * + * r[j+len]: V13, V18, V23, V28 + * r[j]: V12, V17, V22, V27 + * + * In order to do the coefficients computation, the zeta vector is arranged + * in the proper order to match the multiplication. + */ +.macro Load_L24Coeffs + lxvd2x 32+25, 0, dup_rinp /* v25/v26 (vresult_a3/vresult_b3) serve as scratch here */ + lxvd2x 32+26, 10, dup_rinp + vmrgew vdata_b1, 25, 26 + vmrgow vdata_a1, 25, 26 + lxvd2x 32+25, 11, dup_rinp + lxvd2x 32+26, 12, dup_rinp + vmrgew vdata_b2, 25, 26 + vmrgow vdata_a2, 25, 26 + lxvd2x 32+25, 15, dup_rinp + lxvd2x 32+26, 16, dup_rinp + vmrgew vdata_b3, 25, 26 + vmrgow vdata_a3, 25, 26 + lxvd2x 32+25, 17, dup_rinp + lxvd2x 32+26, 18, dup_rinp + vmrgew vdata_b4, 25, 26 + vmrgow vdata_a4, 25, 26 +.endm + +/* + * Load 4 - 4 layout + * + * Load Coefficients and setup vectors for 8 coefficients in the + * following order, + * rj0, rj1, rj2, rj3, rjlen4, rjlen5, rjlen6, rjlen7 + * rj8, rj9, rj10, rj11, rjlen12, rjlen13, rjlen14, rjlen15 + * + * Each xxpermdi will transpose vectors as, + * rjlen4, rjlen5, rjlen6, rjlen7, rjlen12, rjlen13, rjlen14, rjlen15 + * rj0, rj1, rj2, rj3, rj8, rj9, rj10, rj11 + * + * In order to do the coefficients computation, the zeta vector is arranged + * in the proper order to match the multiplication.
+ */ +.macro Load_L44Coeffs + lxvd2x 1, 0, dup_rinp /* vs1-vs4 serve as scratch VSRs */ + lxvd2x 2, 10, dup_rinp + xxpermdi 32+vdata_b1, 2, 1, 3 + xxpermdi 32+vdata_a1, 2, 1, 0 + lxvd2x 3, 11, dup_rinp + lxvd2x 4, 12, dup_rinp + xxpermdi 32+vdata_b2, 4, 3, 3 + xxpermdi 32+vdata_a2, 4, 3, 0 + lxvd2x 1, 15, dup_rinp + lxvd2x 2, 16, dup_rinp + xxpermdi 32+vdata_b3, 2, 1, 3 + xxpermdi 32+vdata_a3, 2, 1, 0 + lxvd2x 3, 17, dup_rinp + lxvd2x 4, 18, dup_rinp + xxpermdi 32+vdata_b4, 4, 3, 3 + xxpermdi 32+vdata_a4, 4, 3, 0 +.endm + +/* + * Barrett multiplication (4 lanes). + * For each lane (b in vdata_b_i, z in _vz_i, zt in _vzt_i): + * t = vmhraddshs(b, zt, 0) = round(b*zt / 2^15) (held in vresult_a_i) + * b_lo = vmladduhm(b, z, 0) = (b*z) mod 2^16 + * vdata_b_i = vmladduhm(t, -q, b_lo) = b*z - t*q (mod 2^16) + * + * Computes (b*z) mod q in signed representation; the output + * is in the range (-q, q). + */ +.macro barrett_fqmul_4x _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3 + vmhraddshs vresult_a1, vdata_b1, \_vzt0, V_ZERO + vmhraddshs vresult_a2, vdata_b2, \_vzt1, V_ZERO + vmhraddshs vresult_a3, vdata_b3, \_vzt2, V_ZERO + vmhraddshs vresult_a4, vdata_b4, \_vzt3, V_ZERO + + vmladduhm vdata_b1, vdata_b1, \_vz0, V_ZERO + vmladduhm vdata_b2, vdata_b2, \_vz1, V_ZERO + vmladduhm vdata_b3, vdata_b3, \_vz2, V_ZERO + vmladduhm vdata_b4, vdata_b4, \_vz3, V_ZERO + + vmladduhm vdata_b1, vresult_a1, V_NMKQ, vdata_b1 + vmladduhm vdata_b2, vresult_a2, V_NMKQ, vdata_b2 + vmladduhm vdata_b3, vresult_a3, V_NMKQ, vdata_b3 + vmladduhm vdata_b4, vresult_a4, V_NMKQ, vdata_b4 +.endm + +/* + * Load 4 r[j] (r) coefficient vectors: + * Load coefficient in vectors from offset, R9, R16, R18 and R20 + * r[j]: V12, V17, V22, V27 + */ +.macro Load_4Rj + lxvd2x 32+vdata_a1, rinp, a1_offset /* V12: vector r0 */ + lxvd2x 32+vdata_a2, rinp, a2_offset /* V17: vector r1 */ + lxvd2x 32+vdata_a3, rinp, a3_offset /* V22: vector r2 */ + lxvd2x 32+vdata_a4, rinp, a4_offset /* V27: vector r3 */ +.endm + +/* + * Compute final r[j] and r[j+len] + * final
r[j+len]: V16, V21, V26, V31 + * final r[j]: V15, V20, V25, V30 + */ +.macro Compute_4Coeffs + /* t (vdata_b_i) is the result of the Barrett multiplication, stated here as bounded + by q/2 in absolute value (NOTE(review): barrett_fqmul_4x documents (-q, q) -- confirm). + Complete the final update of the results with add/sub: + r[j] = r[j] + t. + r[j+len] = r[j] - t + */ + vsubuhm vresult_b1, vdata_a1, vdata_b1 + vadduhm vresult_a1, vdata_b1, vdata_a1 + vsubuhm vresult_b2, vdata_a2, vdata_b2 + vadduhm vresult_a2, vdata_b2, vdata_a2 + vsubuhm vresult_b3, vdata_a3, vdata_b3 + vadduhm vresult_a3, vdata_b3, vdata_a3 + vsubuhm vresult_b4, vdata_a4, vdata_b4 + vadduhm vresult_a4, vdata_b4, vdata_a4 +.endm + +.macro Write_One + stxvd2x 32+vresult_a1, rinp, a1_offset + stxvd2x 32+vresult_b1, rinp, b1_offset + stxvd2x 32+vresult_a2, rinp, a2_offset + stxvd2x 32+vresult_b2, rinp, b2_offset + stxvd2x 32+vresult_a3, rinp, a3_offset + stxvd2x 32+vresult_b3, rinp, b3_offset + stxvd2x 32+vresult_a4, rinp, a4_offset + stxvd2x 32+vresult_b4, rinp, b4_offset +.endm + +/* + * Compute the final coefficients, then transpose them from the 4-4 layout + * back to the original coefficient array order and store them. + */ +.macro PermWriteL44 + Compute_4Coeffs + xxpermdi 0, 32+vresult_a1, 32+vresult_b1, 3 + xxpermdi 1, 32+vresult_a1, 32+vresult_b1, 0 + xxpermdi 2, 32+vresult_a2, 32+vresult_b2, 3 + xxpermdi 3, 32+vresult_a2, 32+vresult_b2, 0 + xxpermdi 4, 32+vresult_a3, 32+vresult_b3, 3 + xxpermdi 5, 32+vresult_a3, 32+vresult_b3, 0 + xxpermdi 6, 32+vresult_a4, 32+vresult_b4, 3 + xxpermdi 7, 32+vresult_a4, 32+vresult_b4, 0 + stxvd2x 0, 0, dup_rinp + stxvd2x 1, 10, dup_rinp + stxvd2x 2, 11, dup_rinp + stxvd2x 3, 12, dup_rinp + stxvd2x 4, 15, dup_rinp + stxvd2x 5, 16, dup_rinp + stxvd2x 6, 17, dup_rinp + stxvd2x 7, 18, dup_rinp +.endm + +/* + * Transpose the final coefficients of 2-2-2-2 layout to the original + * coefficient array order.
+ */ +.macro PermWriteL24 + Compute_4Coeffs + vmrgew 10, vresult_b1, vresult_a1 + vmrgow 11, vresult_b1, vresult_a1 + vmrgew 12, vresult_b2, vresult_a2 + vmrgow 13, vresult_b2, vresult_a2 + vmrgew 14, vresult_b3, vresult_a3 + vmrgow 15, vresult_b3, vresult_a3 + vmrgew 16, vresult_b4, vresult_a4 + vmrgow 17, vresult_b4, vresult_a4 + stxvd2x 32+10, 0, dup_rinp + stxvd2x 32+11, 10, dup_rinp + stxvd2x 32+12, 11, dup_rinp + stxvd2x 32+13, 12, dup_rinp + stxvd2x 32+14, 15, dup_rinp + stxvd2x 32+15, 16, dup_rinp + stxvd2x 32+16, 17, dup_rinp + stxvd2x 32+17, 18, dup_rinp +.endm + +.macro Load_next_4zetas + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 32+V_Z0, 0, zeta_inp + lxvd2x 32+V_Z1, 10, zeta_inp + lxvd2x 32+V_Z2, 11, zeta_inp + lxvd2x 32+V_Z3, 12, zeta_inp + lxvd2x 32+V_ZT0, 0, zeta_tw_inp + lxvd2x 32+V_ZT1, 10, zeta_tw_inp + lxvd2x 32+V_ZT2, 11, zeta_tw_inp + lxvd2x 32+V_ZT3, 12, zeta_tw_inp + addi zeta_inp, zeta_inp, 64 + addi zeta_tw_inp, zeta_tw_inp, 64 +.endm + +/* + * NTT layer 7, Len=2. + */ +.macro ntt_layer7 + Load_next_4zetas + Load_L24Coeffs + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + PermWriteL24 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT layer 6, Len=4. + */ +.macro ntt_layer6 + Load_next_4zetas + Load_L44Coeffs + barrett_fqmul_4x V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + PermWriteL44 + addi dup_rinp, dup_rinp, 128 +.endm + +/* + * NTT other layers, 1, 2, 3, 4, 5. + */ +.macro ntt_layer12345 start, next, _vz0, _vz1, _vz2, _vz3, _vzt0, _vzt1, _vzt2, _vzt3 + Load_4Coeffs \start, \next + barrett_fqmul_4x \_vz0, \_vz1, \_vz2, \_vz3, \_vzt0, \_vzt1, \_vzt2, \_vzt3 + Load_4Rj + Compute_4Coeffs + Write_One +.endm + +/* + * mlk_ntt_ppc_asm(int16_t *r, int16_t *qdata) + * Compute forward NTT based on the following 7 layers - + * len = 128, 64, 32, 16, 8, 4, 2. + * + * Each layer compute the coefficients on 2 legs, start and start + len*2 offsets. 
+ * + * leg 1 leg 2 + * ----- ----- + * start start+len*2 + * start+next start+len*2+next + * start+next+next start+len*2+next+next + * start+next+next+next start+len*2+next+next+next + * + * Each computation loads 8 vectors, 4 for each leg. + * The final coefficient (t) from each vector of leg1 and leg2 then do the + * add/sub operations to obtain the final results. + * + * -> leg1 = leg1 + t, leg2 = leg1 - t + * + * The resulting coefficients then store back to each leg's offset. + * + * Each vector has the same corresponding zeta except len=4 and len=2. + * + * len=4 has 4-4 layout which means every 4 16-bit coefficients has the same zeta. + * and len=2 has 2-2-2-2 layout which means every 2 16-bit coefficients has the same zeta. + * e.g. + * coeff vector a1 a2 a3 a4 a5 a6 a7 a8 + * zeta vector z1 z1 z2 z2 z3 z3 z4 z4 + * + * For len=4 and len=2, each vector will get permuted to leg1 and leg2. Zeta is + * pre-arranged for the leg1 and leg2. After the computation, each vector needs + * to transpose back to its original 4-4 or 2-2-2-2 layout. + * + */ +.global MLK_ASM_NAMESPACE(ntt_ppc_asm) +.balign 16 +MLK_ASM_FN_SYMBOL(ntt_ppc_asm) + + SAVE_REGS + + /* load -MLKEM_Q */ + lvx V_NMKQ,0,qinp + + /* zeta_inp: regular Barrett roots; zeta_tw_inp: twisted roots */ + addi zeta_inp, qinp, MLK_PPC_ZETA_NTT_OFFSET + addi zeta_tw_inp, qinp, MLK_PPC_ZETA_NTT_TW_OFFSET + + vxor V_ZERO, V_ZERO, V_ZERO + +.balign 16 + /* + * Layer 1. 
len = 128, start= 0, 64, 128, 192 + */ + li len_2, 256 /* len * 2 */ + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 192, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + +.balign 16 + /* + * Layer 2. len = 64, start= 0, 64, 256, 320 + */ + li len_2, 128 + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 64, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + ntt_layer12345 320, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + +.balign 16 + /* + * Layer 3. 
len = 32, start = 0, 128, 256, 384 + */ + li len_2, 64 + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 0, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 128, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 256, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + + lvx V_ZETA, 0, zeta_inp + lvx V_ZETATW, 0, zeta_tw_inp + addi zeta_inp, zeta_inp, 16 + addi zeta_tw_inp, zeta_tw_inp, 16 + ntt_layer12345 384, 16, V_ZETA, V_ZETA, V_ZETA, V_ZETA, V_ZETATW, V_ZETATW, V_ZETATW, V_ZETATW + +.balign 16 + /* + * Layer 4. len = 16, start = 0, 16, 256, 272 + */ + li len_2, 32 + Load_next_4zetas + ntt_layer12345 0, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + ntt_layer12345 16, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + + Load_next_4zetas + ntt_layer12345 256, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + ntt_layer12345 272, 64, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + +.balign 16 + /* + * Layer 5. len = 8, start= 0, 128, 256, 384 + */ + li len_2, 16 + Load_next_4zetas + ntt_layer12345 0, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + + Load_next_4zetas + ntt_layer12345 128, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + + Load_next_4zetas + ntt_layer12345 256, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + + Load_next_4zetas + ntt_layer12345 384, 32, V_Z0, V_Z1, V_Z2, V_Z3, V_ZT0, V_ZT1, V_ZT2, V_ZT3 + + /* + * Layer 6. 
len = 4, + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Load zeta vectors in 4-4 layout + */ + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 8 + + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + +.balign 16 + ntt_layer6 + ntt_layer6 + ntt_layer6 + ntt_layer6 + + /* + * Layer 7. len = 2 + * leg1 offset - 0, 32, 64, 96 + * leg2 offset - 16, 48, 80, 112 + * + * Load zeta vectors in 2-2-2-2 layout + */ + mr dup_rinp, rinp /* Let r5 points to coefficient array */ + li len_2, 4 + +.balign 16 + ntt_layer7 + ntt_layer7 + ntt_layer7 + ntt_layer7 + + RESTORE_REGS + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V_ZERO +#undef V_NMKQ +#undef V_Z0 +#undef V_Z1 +#undef V_Z2 +#undef V_Z3 +#undef V_ZETA +#undef V_ZT0 +#undef V_ZT1 +#undef V_ZT2 +#undef V_ZT3 +#undef V_ZETATW +#undef vdata_a1 +#undef vdata_a2 +#undef vdata_a3 +#undef vdata_a4 +#undef vdata_b1 +#undef vdata_b2 +#undef vdata_b3 +#undef vdata_b4 +#undef vresult_a1 +#undef vresult_b1 +#undef vresult_a2 +#undef vresult_b2 +#undef vresult_a3 +#undef vresult_b3 +#undef vresult_a4 +#undef vresult_b4 +#undef rinp +#undef dup_rinp +#undef qinp +#undef len_2 +#undef zeta_inp +#undef zeta_tw_inp +#undef a1_offset +#undef a2_offset +#undef a3_offset +#undef a4_offset +#undef b1_offset +#undef b2_offset +#undef b3_offset +#undef b4_offset + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/poly_tomont_ppc_asm.S b/dev/ppc64le/src/poly_tomont_ppc_asm.S new file mode 100644 index 0000000000..21a1829eb6 --- /dev/null +++ b/dev/ppc64le/src/poly_tomont_ppc_asm.S @@ -0,0 +1,183 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 
2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ + +/* + * Poly_tomont: In-place conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments: r3 (*r): input/output polynomial, r4: constant table pointer + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) +/* simpasm: header-end */ + +#include "consts.h" + +#define V1353 0 +#define V_QINV 2 +#define V_NMKQ 5 + +.text + +/* + * montgomery_reduce + * t = a * QINV + * t = (a - (int32_t)t*_MLKEM_Q) >> 16 + * + *----------------------------------- + * mont_reduce_4x(_v0, _v1, _v2, _v3) + */ + +.macro mont_reduce_4x _v0, _v1, _v2, _v3 + lxvd2x 32+13, 0, 3 /* load 4 vectors from r3, advancing r3 by 16 each */ + addi 3, 3, 16 + lxvd2x 32+18, 0, 3 + addi 3, 3, 16 + lxvd2x 32+23, 0, 3 + addi 3, 3, 16 + lxvd2x 32+7, 0, 3 + addi 3, 3, 16 + + vmladduhm 15, 13, V1353, 3 /* low halfword of a * V1353 + v3 */ + vmladduhm 20, 18, V1353, 3 + vmladduhm 25, 23, V1353, 3 + vmladduhm 9, 7, V1353, 3 + + vmhraddshs 14, 13, V1353, 3 /* ((a * V1353 + 0x4000) >> 15) + v3 */ + vmhraddshs 19, 18, V1353, 3 + vmhraddshs 24, 23, V1353, 3 + vmhraddshs 8, 7, V1353, 3 + + vmladduhm 15, 15, V_QINV, 3 /* t = low * V_QINV + v3 (mod 2^16) */ + vmladduhm 20, 20, V_QINV, 3 + vmladduhm 25, 25, V_QINV, 3 + vmladduhm 9, 9, V_QINV, 3 + + vmhraddshs 15, 15, V_NMKQ, 14 /* ((t * V_NMKQ + 0x4000) >> 15) + high */ + vmhraddshs 20, 20, V_NMKQ, 19 + vmhraddshs 25, 25, V_NMKQ, 24 + vmhraddshs 9, 9, V_NMKQ, 8 + + vsrah \_v0, 15, 4 /* arithmetic right shift by the count in v4 */ + vsrah \_v1, 20, 4 + vsrah \_v2, 25, 4 + vsrah \_v3, 9, 4 +.endm + +.macro Write_8X + stxvd2x 32+27, 4, 3 /* store 8 result vectors at r3 + r4..r11 */ + stxvd2x 32+28, 5, 3 + stxvd2x 32+29, 6, 3 + stxvd2x 32+30, 7, 3 + stxvd2x 32+13, 8, 3 + stxvd2x 32+18, 9, 3 + stxvd2x 32+23, 10, 3 + stxvd2x 32+7, 11, 3 +.endm + +.global MLK_ASM_NAMESPACE(poly_tomont_ppc_asm) +.balign 16 +MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) + stdu 1, -320(1) /* allocate a 320-byte stack frame */ + mflr 0 + + li 6, 128 /* frame offsets used to save v20-v30 */ + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx
32+24, 10, 1 + stxvx 32+25, 11, 1 + stxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvx 32+27, 6, 1 + stxvx 32+28, 7, 1 + stxvx 32+29, 8, 1 + stxvx 32+30, 9, 1 + + li 6, MLK_PPC_NQ_OFFSET /* load the constant vectors from the table at r4 */ + li 7, MLK_PPC_QINV_OFFSET + li 8, MLK_PPC_C1353_OFFSET + lxvx 32+V_NMKQ, 6, 4 + lxvx 32+V_QINV, 7, 4 + lxvx 32+V1353, 8, 4 + + vxor 3, 3, 3 /* v3 = 0 */ + vspltish 4, 1 /* v4 = 1 in every halfword */ + + li 4, -128 /* r4..r11 = negative store offsets for Write_8X */ + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + mont_reduce_4x 27, 28, 29, 30 /* each group of 3 lines handles 64 coefficients */ + mont_reduce_4x 13, 18, 23, 7 + Write_8X + + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 + Write_8X + + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 + Write_8X + + mont_reduce_4x 27, 28, 29, 30 + mont_reduce_4x 13, 18, 23, 7 + Write_8X + + li 6, 128 /* restore v20-v30 from the frame */ + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + lxvx 32+25, 11, 1 + lxvx 32+26, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvx 32+27, 6, 1 + lxvx 32+28, 7, 1 + lxvx 32+29, 8, 1 + lxvx 32+30, 9, 1 + mtlr 0 + addi 1, 1, 320 /* release the stack frame */ + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V1353 +#undef V_QINV +#undef V_NMKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/dev/ppc64le/src/reduce_ppc_asm.S b/dev/ppc64le/src/reduce_ppc_asm.S new file mode 100644 index 0000000000..e689fdeeec --- /dev/null +++ b/dev/ppc64le/src/reduce_ppc_asm.S @@ -0,0 +1,229 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp.
2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) +/* simpasm: header-end */ + +#include "consts.h" + +/* + * poly_reduce: Applies Barrett reduction to all coefficients of a polynomial, + * then maps each result to its unsigned canonical representative + * + * Arguments: r3 (*r): input/output polynomial, r4: constant table pointer + */ + +/* Barrett reduce constants */ +#define V20159 0 +#define V_25 1 +#define V_26 2 +#define V_MKQ 3 + +.text + +.macro barrett_reduce_4x _v0, _v1, _v2, _v3 + lxvd2x 32+8, 0, 3 /* load 4 vectors at r3 + 0, r14, r15, r16 */ + lxvd2x 32+12, 14, 3 + lxvd2x 32+16, 15, 3 + lxvd2x 32+20, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, V20159 /* 32-bit products of odd/even halfwords with V20159 */ + vmulesh 5, 8, V20159 + vmulosh 11, 12, V20159 + vmulesh 10, 12, V20159 + vmulosh 15, 16, V20159 + vmulesh 14, 16, V20159 + vmulosh 19, 20, V20159 + vmulesh 18, 20, V20159 + xxmrglw 32+4, 32+5, 32+6 /* interleave the even/odd products word-wise */ + xxmrghw 32+5, 32+5, 32+6 + xxmrglw 32+9, 32+10, 32+11 + xxmrghw 32+10, 32+10, 32+11 + xxmrglw 32+13, 32+14, 32+15 + xxmrghw 32+14, 32+14, 32+15 + xxmrglw 32+17, 32+18, 32+19 + xxmrghw 32+18, 32+18, 32+19 + vadduwm 4, 4, V_25 /* add the rounding constant V_25 */ + vadduwm 5, 5, V_25 + vadduwm 9, 9, V_25 + vadduwm 10, 10, V_25 + vadduwm 13, 13, V_25 + vadduwm 14, 14, V_25 + vadduwm 17, 17, V_25 + vadduwm 18, 18, V_25 + vsraw 4, 4, V_26 /* arithmetic right shift by V_26 */ + vsraw 5, 5, V_26 + vsraw 9, 9, V_26 + vsraw 10, 10, V_26 + vsraw 13, 13, V_26 + vsraw 14, 14, V_26 + vsraw 17, 17, V_26 + vsraw 18, 18, V_26 + vpkuwum 4, 5, 4 /* pack quotients to halfwords, then negate (v7 = 0) */ + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm \_v0, 4, V_MKQ, 8 /* a + (-quotient) * V_MKQ (mod 2^16) */ + vmladduhm \_v1, 9, V_MKQ, 12 + vmladduhm \_v2, 13, V_MKQ, 16 + vmladduhm \_v3, 17, V_MKQ, 20 +.endm + +.macro Write_8X + stxvd2x 32+21, 4, 3 /* store 8 result vectors at r3 + r4..r11 */ + stxvd2x 32+22, 5, 3 + stxvd2x 32+23, 6, 3 + stxvd2x 32+24, 7, 3 + stxvd2x 32+4, 8, 3 + stxvd2x 32+9, 9, 3 + stxvd2x 32+13, 10, 3 + stxvd2x 32+17,
11, 3 +.endm + +/* + * Conditional addition to get unsigned canonical representative + */ +.macro To_unsigned_16 + lxvd2x 32+12, 0, 3 /* load 4 vectors at r3 + 0, r14, r15, r16 */ + lxvd2x 32+13, 14, 3 + lxvd2x 32+14, 15, 3 + lxvd2x 32+15, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 /* logical right shift by v10: isolates the sign bit */ + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 /* candidate = a + v11 */ + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 /* mask = all ones where the sign bit equals v9 */ + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 32+1, 32+7,32+12, 32+1 /* mask set: keep a; otherwise take a + v11 */ + xxsel 32+0, 32+8,32+13, 32+0 + xxsel 32+3, 32+5,32+14, 32+3 + xxsel 32+2, 32+6,32+15, 32+2 + stxvd2x 32+3, 10, 3 /* store back over the 64 bytes just loaded */ + stxvd2x 32+2, 11, 3 + stxvd2x 32+1, 8, 3 + stxvd2x 32+0, 9, 3 +.endm + +.global MLK_ASM_NAMESPACE(reduce_ppc_asm) +.balign 16 +MLK_ASM_FN_SYMBOL(reduce_ppc_asm) + stdu 1, -224(1) /* allocate a 224-byte stack frame */ + mflr 0 + std 14, 96(1) /* save r14-r16 */ + std 15, 104(1) + std 16, 112(1) + li 6, 128 /* frame offsets used to save v20-v24 */ + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvx 32+20, 6, 1 + stxvx 32+21, 7, 1 + stxvx 32+22, 8, 1 + stxvx 32+23, 9, 1 + stxvx 32+24, 10, 1 + + vxor 7, 7, 7 /* v7 = 0 */ + + li 6, MLK_PPC_Q_OFFSET /* load the constant vectors from the table at r4 */ + li 7, MLK_PPC_C20159_OFFSET + lxvx 32+V_MKQ, 6, 4 + lxvx 32+V20159, 7, 4 + + vspltisw V_26, 13 /* V_26 = 13 + 13 = 26 in every word */ + vadduwm V_26, V_26, V_26 + vspltisw 4, 1 /* V_25 = 1 << (26 - 1) in every word */ + vsubuwm 5, V_26, 4 + vslw V_25, 4, 5 + + li 4, -128 /* r4..r11 = negative store offsets for Write_8X */ + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + + li 14, 16 /* r14..r16 = load offsets 16, 32, 48 */ + li 15, 32 + li 16, 48 + + barrett_reduce_4x 21, 22, 23, 24 /* each group of 3 lines handles 64 coefficients */ + barrett_reduce_4x 4, 9, 13, 17 + Write_8X + + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 + Write_8X + + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 + Write_8X + + barrett_reduce_4x 21, 22, 23, 24 + barrett_reduce_4x 4, 9, 13, 17 + Write_8X + +.balign 16 + /* + * To unsigned canonical + */ + addi 3, 3, -512 /* rewind r3 over the 512 bytes processed above */ + vxor 9, 9, 9 /* v9 = 0 */ + vspltish 10, 15 /* v10 = 15 in every halfword (shift count) */ + vmr 11, V_MKQ /* v11 = copy of V_MKQ; To_unsigned_16 overwrites v0-v3 */ + + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + To_unsigned_16 + + ld 14, 96(1) /* restore r14-r16 and v20-v24 */ + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7,
144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvx 32+20, 6, 1 + lxvx 32+21, 7, 1 + lxvx 32+22, 8, 1 + lxvx 32+23, 9, 1 + lxvx 32+24, 10, 1 + mtlr 0 + addi 1, 1, 224 /* release the stack frame */ + blr + +/* To facilitate single-compilation-unit (SCU) builds, undefine all macros. + * Don't modify by hand -- this is auto-generated by scripts/autogen. */ +#undef V20159 +#undef V_25 +#undef V_26 +#undef V_MKQ + +/* simpasm: footer-start */ +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ diff --git a/flake.nix b/flake.nix index 31483560ae..660577dc3d 100644 --- a/flake.nix +++ b/flake.nix @@ -114,13 +114,13 @@ packages = builtins.attrValues { inherit (config.packages) linters hol_light s2n_bignum hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchains hol_light s2n_bignum gcc-arm-embedded hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchains toolchain_ppc64le hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross-aarch64 = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64 hol_light s2n_bignum gcc-arm-embedded hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchain_aarch64 toolchain_ppc64le hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook = holLightShellHook; }); devShells.hol_light-cross-x86_64 = (util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64 hol_light s2n_bignum gcc-arm-embedded hol_server; }; + packages = builtins.attrValues { inherit (config.packages) linters toolchain_x86_64 toolchain_ppc64le hol_light s2n_bignum gcc-arm-embedded hol_server; }; }).overrideAttrs (old: { shellHook =
holLightShellHook; }); devShells.ci = util.mkShell { packages = builtins.attrValues { inherit (config.packages) linters toolchains_native; }; @@ -158,7 +158,7 @@ # autogen shell with cross compiler for the "other" architecture devShells.cross-autogen = util.mkShell { - packages = builtins.attrValues { inherit (config.packages) linters; inherit (pkgs) gcc-arm-embedded; } + packages = builtins.attrValues { inherit (config.packages) linters toolchain_ppc64le; inherit (pkgs) gcc-arm-embedded; } ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isx86_64 [ config.packages.toolchain_aarch64 ] ++ pkgs.lib.optionals pkgs.stdenv.hostPlatform.isAarch64 [ config.packages.toolchain_x86_64 ]; }; diff --git a/mlkem/mlkem_native.c b/mlkem/mlkem_native.c index 2a38b002b6..7cef81f546 100644 --- a/mlkem/mlkem_native.c +++ b/mlkem/mlkem_native.c @@ -88,6 +88,9 @@ #include "src/native/riscv64/src/rv64v_debug.c" #include "src/native/riscv64/src/rv64v_poly.c" #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/consts.c" +#endif #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -658,5 +661,38 @@ #undef mlk_debug_check_bounds_int16m1 #undef mlk_debug_check_bounds_int16m2 #endif /* MLK_SYS_RISCV64 */ +#if defined(MLK_SYS_PPC64LE) +/* + * Undefine macros from native code (Arith, PPC64LE) + */ +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc_asm +#undef mlk_ntt_ppc_asm +#undef mlk_poly_tomont_ppc_asm +#undef mlk_reduce_ppc_asm +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef MLK_PPC_C1353_OFFSET +#undef MLK_PPC_C20159_OFFSET +#undef 
MLK_PPC_NQ_OFFSET +#undef MLK_PPC_N_INV_OFFSET +#undef MLK_PPC_N_INV_TW_OFFSET +#undef MLK_PPC_QINV_OFFSET +#undef MLK_PPC_Q_OFFSET +#undef MLK_PPC_ZETA_INTT_OFFSET +#undef MLK_PPC_ZETA_INTT_TW_OFFSET +#undef MLK_PPC_ZETA_NTT_OFFSET +#undef MLK_PPC_ZETA_NTT_TW_OFFSET +#undef mlk_ppc_qdata +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mlkem/mlkem_native_asm.S b/mlkem/mlkem_native_asm.S index 9e7dfdd040..6d9554adbd 100644 --- a/mlkem/mlkem_native_asm.S +++ b/mlkem/mlkem_native_asm.S @@ -95,6 +95,12 @@ #endif /* MLK_SYS_X86_64 */ #if defined(MLK_SYS_RISCV64) #endif +#if defined(MLK_SYS_PPC64LE) +#include "src/native/ppc64le/src/intt_ppc_asm.S" +#include "src/native/ppc64le/src/ntt_ppc_asm.S" +#include "src/native/ppc64le/src/poly_tomont_ppc_asm.S" +#include "src/native/ppc64le/src/reduce_ppc_asm.S" +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) @@ -679,5 +685,38 @@ #undef mlk_debug_check_bounds_int16m1 #undef mlk_debug_check_bounds_int16m2 #endif /* MLK_SYS_RISCV64 */ +#if defined(MLK_SYS_PPC64LE) +/* + * Undefine macros from native code (Arith, PPC64LE) + */ +/* mlkem/src/native/ppc64le/meta.h */ +#undef MLK_ARITH_BACKEND_NAME +#undef MLK_ARITH_BACKEND_PPC64LE_DEFAULT +#undef MLK_NATIVE_PPC64LE_META_H +#undef MLK_USE_NATIVE_INTT +#undef MLK_USE_NATIVE_NTT +#undef MLK_USE_NATIVE_POLY_REDUCE +#undef MLK_USE_NATIVE_POLY_TOMONT +/* mlkem/src/native/ppc64le/src/arith_native_ppc64le.h */ +#undef MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#undef mlk_intt_ppc_asm +#undef mlk_ntt_ppc_asm +#undef mlk_poly_tomont_ppc_asm +#undef mlk_reduce_ppc_asm +/* mlkem/src/native/ppc64le/src/consts.h */ +#undef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#undef MLK_PPC_C1353_OFFSET +#undef MLK_PPC_C20159_OFFSET +#undef MLK_PPC_NQ_OFFSET +#undef MLK_PPC_N_INV_OFFSET +#undef MLK_PPC_N_INV_TW_OFFSET +#undef 
MLK_PPC_QINV_OFFSET +#undef MLK_PPC_Q_OFFSET +#undef MLK_PPC_ZETA_INTT_OFFSET +#undef MLK_PPC_ZETA_INTT_TW_OFFSET +#undef MLK_PPC_ZETA_NTT_OFFSET +#undef MLK_PPC_ZETA_NTT_TW_OFFSET +#undef mlk_ppc_qdata +#endif /* MLK_SYS_PPC64LE */ #endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */ #endif /* !MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */ diff --git a/mlkem/src/native/meta.h b/mlkem/src/native/meta.h index 4291d629b1..dcd539ab13 100644 --- a/mlkem/src/native/meta.h +++ b/mlkem/src/native/meta.h @@ -22,4 +22,8 @@ #include "riscv64/meta.h" #endif +#ifdef MLK_SYS_PPC64LE +#include "ppc64le/meta.h" +#endif + #endif /* !MLK_NATIVE_META_H */ diff --git a/mlkem/src/native/ppc64le/README.md b/mlkem/src/native/ppc64le/README.md new file mode 100644 index 0000000000..733e32e113 --- /dev/null +++ b/mlkem/src/native/ppc64le/README.md @@ -0,0 +1,6 @@ +[//]: # (SPDX-License-Identifier: CC-BY-4.0) + +# ppc64le backend (little endian) + +This directory contains a native backend for little-endian POWER8 (ppc64le) and later systems, +i.e., Power systems supporting ISA 2.07 and above. diff --git a/mlkem/src/native/ppc64le/meta.h b/mlkem/src/native/ppc64le/meta.h new file mode 100644 index 0000000000..9cd3b66cdd --- /dev/null +++ b/mlkem/src/native/ppc64le/meta.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_META_H +#define MLK_NATIVE_PPC64LE_META_H + +/* Identifier for this backend so that source and assembly files + * in the build can be appropriately guarded.
*/ +#define MLK_ARITH_BACKEND_PPC64LE_DEFAULT + +#define MLK_ARITH_BACKEND_NAME PPC64LE_DEFAULT + +/* Set of primitives that this backend replaces */ +#define MLK_USE_NATIVE_NTT +#define MLK_USE_NATIVE_INTT +#define MLK_USE_NATIVE_POLY_REDUCE +#define MLK_USE_NATIVE_POLY_TOMONT + +#if !defined(__ASSEMBLER__) +#include <stdint.h> +#include "../../common.h" +#include "../../params.h" +#include "../api.h" +#include "src/arith_native_ppc64le.h" + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_ntt_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else /* !__POWER8_VECTOR__: request the C fallback */ + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_intt_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else /* !__POWER8_VECTOR__: request the C fallback */ + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_reduce_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else /* !__POWER8_VECTOR__: request the C fallback */ + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} + +MLK_MUST_CHECK_RETURN_VALUE +static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N]) +{ +#if defined(__POWER8_VECTOR__) + mlk_poly_tomont_ppc_asm(data, mlk_ppc_qdata); + return MLK_NATIVE_FUNC_SUCCESS; +#else /* !__POWER8_VECTOR__: request the C fallback */ + (void)data; + return MLK_NATIVE_FUNC_FALLBACK; +#endif +} +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_META_H */ diff --git a/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h new file mode 100644 index 0000000000..3bd47ebd76 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#ifndef
MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H +#define MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H + +#include <stdint.h> +#include "../../../common.h" +#include "consts.h" + +#define mlk_ntt_ppc_asm MLK_NAMESPACE(ntt_ppc_asm) +void mlk_ntt_ppc_asm(int16_t *, const int16_t *); /* (poly, consts) */ + +#define mlk_intt_ppc_asm MLK_NAMESPACE(intt_ppc_asm) +void mlk_intt_ppc_asm(int16_t *, const int16_t *); /* (poly, consts) */ + +#define mlk_reduce_ppc_asm MLK_NAMESPACE(reduce_ppc_asm) +void mlk_reduce_ppc_asm(int16_t *r, const int16_t *); /* (poly, consts) */ + +#define mlk_poly_tomont_ppc_asm MLK_NAMESPACE(poly_tomont_ppc_asm) +void mlk_poly_tomont_ppc_asm(int16_t *, const int16_t *); /* (poly, consts) */ + +#endif /* !MLK_NATIVE_PPC64LE_SRC_ARITH_NATIVE_PPC64LE_H */ diff --git a/mlkem/src/native/ppc64le/src/consts.c b/mlkem/src/native/ppc64le/src/consts.c new file mode 100644 index 0000000000..4065b60231 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.c @@ -0,0 +1,95 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" + +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) + +#include "consts.h" + +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 == 7 * 8 + 4 * 63 * 8 */ +MLK_ALIGN MLK_INTERNAL_DATA_DEFINITION const int16_t mlk_ppc_qdata[2072] = { + /* -Q */ + /* check-magic: -3329 == -1 * MLKEM_Q */ + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + -3329, + /* QINV */ + /* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */ + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + -3327, + /* Q */ + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + 3329, + /* check-magic: 20159 == round(2^26 / MLKEM_Q) */ + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + 20159, + /* N^-1 in Montgomery form: pow(128,-1,MLKEM_Q) * 2^16 mod MLKEM_Q = 512.
+ * Multiplying by this via Barrett-fqmul scales INTT output by N^-1 and + * leaves it in Montgomery form (mlk_poly_invntt_tomont contract). */ + 512, + 512, + 512, + 512, + 512, + 512, + 512, + 512, + /* check-magic: 5040 == round((512 * 2**16 + MLKEM_Q) / MLKEM_Q) // 2 */ + /* Barrett twist of N^-1*R = round_to_even(N_INV_MONT * 2^16 / MLKEM_Q) / 2 + */ + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + 5040, + /* check-magic: 1353 == pow(2, 32, MLKEM_Q) */ + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, + 1353, +/* zetas for NTT */ +#include "consts_ntt.inc" +/* zetas for invNTT */ +#include "consts_intt.inc" +/* twisted zetas for NTT (Barrett high-mul) */ +#include "consts_ntt_tw.inc" +/* twisted zetas for invNTT (Barrett high-mul) */ +#include "consts_intt_tw.inc" +}; +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && \ + !MLK_CONFIG_MULTILEVEL_NO_SHARED && __POWER8_VECTOR__ */ diff --git a/mlkem/src/native/ppc64le/src/consts.h b/mlkem/src/native/ppc64le/src/consts.h new file mode 100644 index 0000000000..e35f89bce2 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#ifndef MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#define MLK_NATIVE_PPC64LE_SRC_CONSTS_H +#include "../../../common.h" + +/* Byte offsets into the constant table (8 int16_t = 16 bytes per row) */ +/* check-magic: off */ +#define MLK_PPC_NQ_OFFSET 0 +#define MLK_PPC_QINV_OFFSET 16 +#define MLK_PPC_Q_OFFSET 32 +#define MLK_PPC_C20159_OFFSET 48 +#define MLK_PPC_N_INV_OFFSET 64 +#define MLK_PPC_N_INV_TW_OFFSET 80 +#define MLK_PPC_C1353_OFFSET 96 +#define MLK_PPC_ZETA_NTT_OFFSET 112 +#define MLK_PPC_ZETA_INTT_OFFSET 1120 +#define MLK_PPC_ZETA_NTT_TW_OFFSET 2128 +#define MLK_PPC_ZETA_INTT_TW_OFFSET 3136 +/* check-magic: on */ + +#ifndef __ASSEMBLER__ +#define mlk_ppc_qdata MLK_NAMESPACE(ppc_qdata) +/* 7 groups of 8 base constants + 4 twiddle tables * 63 rows * 8 values */ +/* check-magic: 2072 ==
7 * 8 + 4 * 63 * 8 */ +MLK_INTERNAL_DATA_DECLARATION const int16_t mlk_ppc_qdata[2072]; +#endif /* !__ASSEMBLER__ */ + +#endif /* !MLK_NATIVE_PPC64LE_SRC_CONSTS_H */ diff --git a/mlkem/src/native/ppc64le/src/consts_intt.inc b/mlkem/src/native/ppc64le/src/consts_intt.inc new file mode 100644 index 0000000000..057b1df249 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. + */ + -394, -394, -1175, -1175, -1219, -1219, 885, 885, + 1212, 1212, 1029, 1029, -1607, -1607, -1455, -1455, + -1179, -1179, 886, 886, 1143, 1143, -554, -554, + 1092, 1092, 1026, 1026, -525, -525, 403, 403, + 561, 561, -735, -735, -1230, -1230, -863, -863, + 319, 319, 757, 757, 1063, 1063, -556, -556, + -780, -780, 1645, 1645, 375, 375, -1239, -1239, + -1031, -1031, -109, -109, 1584, 1584, -1292, -1292, + -992, -992, 641, 641, 733, 733, 268, 268, + -1021, -1021, -941, -941, 939, 939, -892, -892, + 952, 952, -642, -642, -1482, -1482, 1461, 1461, + 1651, 1651, -1540, -1540, -1626, -1626, -540, -540, + -1173, -1173, -279, -279, 756, 756, -314, -314, + -667, -667, 233, 233, 1409, 1409, -48, -48, + 723, 723, 1100, 1100, 1637, 1637, -1041, -1041, + -568, -568, -680, -680, 17, 17, 583, 583, + 1227, 1227, 1227, 1227, 910, 910, 910, 910, + -855, -855, -855, -855, -219, -219, -219, -219, + 1481, 1481, 1481, 1481, 648, 648, 648, 648, + -682, -682, -682, -682, -712, -712, -712, -712, + 1534, 1534, 1534, 1534, -927, -927, -927, -927, + 1438, 1438, 1438, 1438, -461, -461, -461, -461, + 807, 807, 807, 807, 452, 452, 452, 452, + -1010, -1010, -1010, -1010, 1435, 1435, 1435, 1435, + 1320, 1320, 1320, 1320, -1414, -1414, -1414, -1414, + -464, -464, -464, 
-464, 33, 33, 33, 33, + -816, -816, -816, -816, 632, 632, 632, 632, + 650, 650, 650, 650, -1352, -1352, -1352, -1352, + -1052, -1052, -1052, -1052, -1274, -1274, -1274, -1274, + 1197, 1197, 1197, 1197, -1025, -1025, -1025, -1025, + -76, -76, -76, -76, -1573, -1573, -1573, -1573, + 289, 289, 289, 289, 331, 331, 331, 331, + 821, 821, 821, 821, 821, 821, 821, 821, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + -450, -450, -450, -450, -450, -450, -450, -450, + -936, -936, -936, -936, -936, -936, -936, -936, + -447, -447, -447, -447, -447, -447, -447, -447, + 535, 535, 535, 535, 535, 535, 535, 535, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 56, 56, 56, 56, 56, 56, 56, 56, + -283, -283, -283, -283, -283, -283, -283, -283, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + -882, -882, -882, -882, -882, -882, -882, -882, + 296, 296, 296, 296, 296, 296, 296, 296, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 569, 569, 569, 569, 569, 569, 569, 569, + -69, -69, -69, -69, -69, -69, -69, -69, + -543, -543, -543, -543, -543, -543, -543, -543, + 797, 797, 797, 797, 797, 797, 797, 797, + 193, 193, 193, 193, 193, 193, 193, 193, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + 848, 848, 848, 848, 848, 848, 848, 848, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 630, 630, 630, 630, 630, 630, 630, 630, + -687, -687, -687, -687, -687, -687, -687, -687, + -40, -40, -40, -40, -40, -40, -40, -40, + -749, -749, -749, -749, -749, -749, -749, -749, + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, diff --git a/mlkem/src/native/ppc64le/src/consts_intt_tw.inc b/mlkem/src/native/ppc64le/src/consts_intt_tw.inc new file mode 100644 index 0000000000..783e0af8f2 --- 
/dev/null +++ b/mlkem/src/native/ppc64le/src/consts_intt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE inverse NTT. + * See autogen for details. + */ + -3878, -3878, -11566, -11566, -11999, -11999, 8711, 8711, + 11930, 11930, 10129, 10129, -15818, -15818, -14322, -14322, + -11605, -11605, 8721, 8721, 11251, 11251, -5453, -5453, + 10749, 10749, 10099, 10099, -5168, -5168, 3967, 3967, + 5522, 5522, -7235, -7235, -12107, -12107, -8495, -8495, + 3140, 3140, 7451, 7451, 10463, 10463, -5473, -5473, + -7678, -7678, 16192, 16192, 3691, 3691, -12196, -12196, + -10148, -10148, -1073, -1073, 15592, 15592, -12717, -12717, + -9764, -9764, 6309, 6309, 7215, 7215, 2638, 2638, + -10050, -10050, -9262, -9262, 9243, 9243, -8780, -8780, + 9371, 9371, -6319, -6319, -14588, -14588, 14381, 14381, + 16251, 16251, -15159, -15159, -16005, -16005, -5315, -5315, + -11546, -11546, -2746, -2746, 7441, 7441, -3091, -3091, + -6565, -6565, 2293, 2293, 13869, 13869, -472, -472, + 7117, 7117, 10828, 10828, 16113, 16113, -10247, -10247, + -5591, -5591, -6693, -6693, 167, 167, 5739, 5739, + 12078, 12078, 12078, 12078, 8957, 8957, 8957, 8957, + -8416, -8416, -8416, -8416, -2156, -2156, -2156, -2156, + 14578, 14578, 14578, 14578, 6378, 6378, 6378, 6378, + -6713, -6713, -6713, -6713, -7008, -7008, -7008, -7008, + 15099, 15099, 15099, 15099, -9125, -9125, -9125, -9125, + 14155, 14155, 14155, 14155, -4538, -4538, -4538, -4538, + 7943, 7943, 7943, 7943, 4449, 4449, 4449, 4449, + -9942, -9942, -9942, -9942, 14125, 14125, 14125, 14125, + 12993, 12993, 12993, 12993, -13918, -13918, -13918, -13918, + -4567, -4567, -4567, -4567, 325, 325, 325, 325, + -8032, -8032, -8032, -8032, 6221, 6221, 6221, 6221, + 6398, 6398, 6398, 6398, -13308, 
-13308, -13308, -13308, + -10355, -10355, -10355, -10355, -12540, -12540, -12540, -12540, + 11782, 11782, 11782, 11782, -10089, -10089, -10089, -10089, + -748, -748, -748, -748, -15483, -15483, -15483, -15483, + 2845, 2845, 2845, 2845, 3258, 3258, 3258, 3258, + 8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, -4429, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 551, 551, 551, 551, 551, 551, 551, 551, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -679, -679, -679, -679, -679, -679, -679, -679, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + -394, -394, -394, -394, -394, -394, -394, -394, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -15749, -15749, -15749, -15749, -15749, 
-15749, -15749, -15749, diff --git a/mlkem/src/native/ppc64le/src/consts_ntt.inc b/mlkem/src/native/ppc64le/src/consts_ntt.inc new file mode 100644 index 0000000000..e53bf13713 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -1600, -1600, -1600, -1600, -1600, -1600, -1600, -1600, + -749, -749, -749, -749, -749, -749, -749, -749, + -40, -40, -40, -40, -40, -40, -40, -40, + -687, -687, -687, -687, -687, -687, -687, -687, + 630, 630, 630, 630, 630, 630, 630, 630, + -1432, -1432, -1432, -1432, -1432, -1432, -1432, -1432, + 848, 848, 848, 848, 848, 848, 848, 848, + 1062, 1062, 1062, 1062, 1062, 1062, 1062, 1062, + -1410, -1410, -1410, -1410, -1410, -1410, -1410, -1410, + 193, 193, 193, 193, 193, 193, 193, 193, + 797, 797, 797, 797, 797, 797, 797, 797, + -543, -543, -543, -543, -543, -543, -543, -543, + -69, -69, -69, -69, -69, -69, -69, -69, + 569, 569, 569, 569, 569, 569, 569, 569, + -1583, -1583, -1583, -1583, -1583, -1583, -1583, -1583, + 296, 296, 296, 296, 296, 296, 296, 296, + -882, -882, -882, -882, -882, -882, -882, -882, + 1339, 1339, 1339, 1339, 1339, 1339, 1339, 1339, + 1476, 1476, 1476, 1476, 1476, 1476, 1476, 1476, + -283, -283, -283, -283, -283, -283, -283, -283, + 56, 56, 56, 56, 56, 56, 56, 56, + -1089, -1089, -1089, -1089, -1089, -1089, -1089, -1089, + 1333, 1333, 1333, 1333, 1333, 1333, 1333, 1333, + 1426, 1426, 1426, 1426, 1426, 1426, 1426, 1426, + -1235, -1235, -1235, -1235, -1235, -1235, -1235, -1235, + 535, 535, 535, 535, 535, 535, 535, 535, + -447, -447, -447, -447, -447, -447, -447, -447, + -936, -936, -936, -936, -936, -936, -936, -936, + -450, -450, -450, -450, -450, -450, 
-450, -450, + -1355, -1355, -1355, -1355, -1355, -1355, -1355, -1355, + 821, 821, 821, 821, 821, 821, 821, 821, + 331, 331, 331, 331, 289, 289, 289, 289, + -1573, -1573, -1573, -1573, -76, -76, -76, -76, + -1025, -1025, -1025, -1025, 1197, 1197, 1197, 1197, + -1274, -1274, -1274, -1274, -1052, -1052, -1052, -1052, + -1352, -1352, -1352, -1352, 650, 650, 650, 650, + 632, 632, 632, 632, -816, -816, -816, -816, + 33, 33, 33, 33, -464, -464, -464, -464, + -1414, -1414, -1414, -1414, 1320, 1320, 1320, 1320, + 1435, 1435, 1435, 1435, -1010, -1010, -1010, -1010, + 452, 452, 452, 452, 807, 807, 807, 807, + -461, -461, -461, -461, 1438, 1438, 1438, 1438, + -927, -927, -927, -927, 1534, 1534, 1534, 1534, + -712, -712, -712, -712, -682, -682, -682, -682, + 648, 648, 648, 648, 1481, 1481, 1481, 1481, + -219, -219, -219, -219, -855, -855, -855, -855, + 910, 910, 910, 910, 1227, 1227, 1227, 1227, + 583, 583, 17, 17, -680, -680, -568, -568, + -1041, -1041, 1637, 1637, 1100, 1100, 723, 723, + -48, -48, 1409, 1409, 233, 233, -667, -667, + -314, -314, 756, 756, -279, -279, -1173, -1173, + -540, -540, -1626, -1626, -1540, -1540, 1651, 1651, + 1461, 1461, -1482, -1482, -642, -642, 952, 952, + -892, -892, 939, 939, -941, -941, -1021, -1021, + 268, 268, 733, 733, 641, 641, -992, -992, + -1292, -1292, 1584, 1584, -109, -109, -1031, -1031, + -1239, -1239, 375, 375, 1645, 1645, -780, -780, + -556, -556, 1063, 1063, 757, 757, 319, 319, + -863, -863, -1230, -1230, -735, -735, 561, 561, + 403, 403, -525, -525, 1026, 1026, 1092, 1092, + -554, -554, 1143, 1143, 886, 886, -1179, -1179, + -1455, -1455, -1607, -1607, 1029, 1029, 1212, 1212, + 885, 885, -1219, -1219, -1175, -1175, -394, -394, diff --git a/mlkem/src/native/ppc64le/src/consts_ntt_tw.inc b/mlkem/src/native/ppc64le/src/consts_ntt_tw.inc new file mode 100644 index 0000000000..a191b3bf2f --- /dev/null +++ b/mlkem/src/native/ppc64le/src/consts_ntt_tw.inc @@ -0,0 +1,77 @@ +/* + * Copyright (c) The mlkem-native project authors + * 
SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * WARNING: This file is auto-generated from scripts/autogen + * in the mlkem-native repository. + * Do not modify it directly. + */ + +/* Twisted twiddle factors for the PPC64LE forward NTT. + * See autogen for details. + */ + -15749, -15749, -15749, -15749, -15749, -15749, -15749, -15749, + -7373, -7373, -7373, -7373, -7373, -7373, -7373, -7373, + -394, -394, -394, -394, -394, -394, -394, -394, + -6762, -6762, -6762, -6762, -6762, -6762, -6762, -6762, + 6201, 6201, 6201, 6201, 6201, 6201, 6201, 6201, + -14095, -14095, -14095, -14095, -14095, -14095, -14095, -14095, + 8347, 8347, 8347, 8347, 8347, 8347, 8347, 8347, + 10453, 10453, 10453, 10453, 10453, 10453, 10453, 10453, + -13879, -13879, -13879, -13879, -13879, -13879, -13879, -13879, + 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, + 7845, 7845, 7845, 7845, 7845, 7845, 7845, 7845, + -5345, -5345, -5345, -5345, -5345, -5345, -5345, -5345, + -679, -679, -679, -679, -679, -679, -679, -679, + 5601, 5601, 5601, 5601, 5601, 5601, 5601, 5601, + -15582, -15582, -15582, -15582, -15582, -15582, -15582, -15582, + 2914, 2914, 2914, 2914, 2914, 2914, 2914, 2914, + -8682, -8682, -8682, -8682, -8682, -8682, -8682, -8682, + 13180, 13180, 13180, 13180, 13180, 13180, 13180, 13180, + 14529, 14529, 14529, 14529, 14529, 14529, 14529, 14529, + -2786, -2786, -2786, -2786, -2786, -2786, -2786, -2786, + 551, 551, 551, 551, 551, 551, 551, 551, + -10719, -10719, -10719, -10719, -10719, -10719, -10719, -10719, + 13121, 13121, 13121, 13121, 13121, 13121, 13121, 13121, + 14036, 14036, 14036, 14036, 14036, 14036, 14036, 14036, + -12156, -12156, -12156, -12156, -12156, -12156, -12156, -12156, + 5266, 5266, 5266, 5266, 5266, 5266, 5266, 5266, + -4400, -4400, -4400, -4400, -4400, -4400, -4400, -4400, + -9213, -9213, -9213, -9213, -9213, -9213, -9213, -9213, + -4429, -4429, -4429, -4429, -4429, -4429, -4429, -4429, + -13338, -13338, -13338, -13338, -13338, -13338, -13338, -13338, + 
8081, 8081, 8081, 8081, 8081, 8081, 8081, 8081, + 3258, 3258, 3258, 3258, 2845, 2845, 2845, 2845, + -15483, -15483, -15483, -15483, -748, -748, -748, -748, + -10089, -10089, -10089, -10089, 11782, 11782, 11782, 11782, + -12540, -12540, -12540, -12540, -10355, -10355, -10355, -10355, + -13308, -13308, -13308, -13308, 6398, 6398, 6398, 6398, + 6221, 6221, 6221, 6221, -8032, -8032, -8032, -8032, + 325, 325, 325, 325, -4567, -4567, -4567, -4567, + -13918, -13918, -13918, -13918, 12993, 12993, 12993, 12993, + 14125, 14125, 14125, 14125, -9942, -9942, -9942, -9942, + 4449, 4449, 4449, 4449, 7943, 7943, 7943, 7943, + -4538, -4538, -4538, -4538, 14155, 14155, 14155, 14155, + -9125, -9125, -9125, -9125, 15099, 15099, 15099, 15099, + -7008, -7008, -7008, -7008, -6713, -6713, -6713, -6713, + 6378, 6378, 6378, 6378, 14578, 14578, 14578, 14578, + -2156, -2156, -2156, -2156, -8416, -8416, -8416, -8416, + 8957, 8957, 8957, 8957, 12078, 12078, 12078, 12078, + 5739, 5739, 167, 167, -6693, -6693, -5591, -5591, + -10247, -10247, 16113, 16113, 10828, 10828, 7117, 7117, + -472, -472, 13869, 13869, 2293, 2293, -6565, -6565, + -3091, -3091, 7441, 7441, -2746, -2746, -11546, -11546, + -5315, -5315, -16005, -16005, -15159, -15159, 16251, 16251, + 14381, 14381, -14588, -14588, -6319, -6319, 9371, 9371, + -8780, -8780, 9243, 9243, -9262, -9262, -10050, -10050, + 2638, 2638, 7215, 7215, 6309, 6309, -9764, -9764, + -12717, -12717, 15592, 15592, -1073, -1073, -10148, -10148, + -12196, -12196, 3691, 3691, 16192, 16192, -7678, -7678, + -5473, -5473, 10463, 10463, 7451, 7451, 3140, 3140, + -8495, -8495, -12107, -12107, -7235, -7235, 5522, 5522, + 3967, 3967, -5168, -5168, 10099, 10099, 10749, 10749, + -5453, -5453, 11251, 11251, 8721, 8721, -11605, -11605, + -14322, -14322, -15818, -15818, 10129, 10129, 11930, 11930, + 8711, 8711, -11999, -11999, -11566, -11566, -3878, -3878, diff --git a/mlkem/src/native/ppc64le/src/intt_ppc_asm.S b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S new file mode 100644 
index 0000000000..f4572b2855 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/intt_ppc_asm.S @@ -0,0 +1,3222 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/intt_ppc_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLK_ASM_NAMESPACE(intt_ppc_asm) +MLK_ASM_FN_SYMBOL(intt_ppc_asm) + + stdu 1, -352(1) + mflr 0 + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvd2x 52, 10, 1 + stxvd2x 53, 11, 1 + stxvd2x 54, 12, 1 + stxvd2x 55, 14, 1 + stxvd2x 56, 15, 1 + stxvd2x 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvd2x 58, 10, 1 + stxvd2x 59, 11, 1 + stxvd2x 60, 12, 1 + stxvd2x 61, 14, 1 + stxvd2x 62, 15, 1 + stxvd2x 63, 16, 1 + lxvd2x 0, 0, 4 + xxlxor 35, 35, 35 + xxlor 3, 35, 35 + li 10, 32 + li 11, 48 + lxvd2x 6, 10, 4 + lxvd2x 32, 11, 4 + vspltisw 8, 13 + vadduwm 8, 8, 8 + xxlor 8, 40, 40 + vspltisw 9, 1 + vsubuwm 10, 8, 9 + vslw 9, 9, 10 + xxlor 7, 41, 41 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + addi 14, 4, 64 + lvx 10, 0, 14 + addi 14, 4, 80 + lvx 11, 0, 14 + li 8, 4 + mtctr 8 + xxlor 37, 0, 0 + +intt_ppc_asm_Loopf: + lxvd2x 57, 0, 3 + lxvd2x 58, 10, 3 + lxvd2x 62, 11, 3 + lxvd2x 63, 12, 3 + addi 3, 3, 64 + vmhraddshs 14, 25, 11, 3 + vmhraddshs 19, 26, 11, 3 + vmhraddshs 24, 30, 11, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 6, 25, 10, 3 + vmladduhm 7, 26, 10, 3 + vmladduhm 
8, 30, 10, 3 + vmladduhm 9, 31, 10, 3 + vmladduhm 6, 14, 5, 6 + vmladduhm 7, 19, 5, 7 + vmladduhm 8, 24, 5, 8 + vmladduhm 9, 29, 5, 9 + lxvd2x 57, 0, 3 + lxvd2x 58, 10, 3 + lxvd2x 62, 11, 3 + lxvd2x 63, 12, 3 + addi 3, 3, 64 + vmhraddshs 14, 25, 11, 3 + vmhraddshs 19, 26, 11, 3 + vmhraddshs 24, 30, 11, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + addi 3, 3, -128 + stxvd2x 38, 0, 3 + stxvd2x 39, 10, 3 + stxvd2x 40, 11, 3 + stxvd2x 41, 12, 3 + stxvd2x 45, 15, 3 + stxvd2x 50, 16, 3 + stxvd2x 55, 17, 3 + stxvd2x 60, 18, 3 + addi 3, 3, 128 + bdnz intt_ppc_asm_Loopf + addi 3, 3, -512 + nop + nop + ori 2, 2, 0 + addi 14, 4, 1120 + addi 22, 4, 3136 + li 7, 4 + mr 5, 3 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 
2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + 
vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 
16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 
12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 8, 25, 26 + vmrgow 21, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 12, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 16, 25, 26 + vmrgow 23, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 20, 25, 26 + vmrgow 24, 25, 26 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 
+ addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + vmrgew 10, 13, 14 + vmrgow 11, 13, 14 + vmrgew 12, 18, 19 + vmrgow 13, 18, 19 + vmrgew 14, 23, 24 + vmrgow 15, 23, 24 + vmrgew 16, 28, 29 + vmrgow 17, 28, 29 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + mr 5, 3 + li 7, 8 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 
9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + 
xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 
+ lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 
46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + lxvd2x 10, 0, 5 + lxvd2x 11, 10, 5 + xxmrgld 40, 11, 10 + xxmrghd 53, 11, 10 + lxvd2x 10, 11, 5 + lxvd2x 11, 12, 5 + xxmrgld 44, 11, 10 + xxmrghd 54, 11, 10 + lxvd2x 10, 15, 5 + lxvd2x 11, 16, 5 + xxmrgld 48, 11, 10 + xxmrghd 55, 11, 10 + lxvd2x 10, 17, 5 + lxvd2x 11, 18, 5 + xxmrgld 52, 11, 10 + xxmrghd 56, 11, 10 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + xxlor 10, 36, 36 + xxlor 11, 41, 41 + xxlor 12, 45, 45 + xxlor 13, 49, 49 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 
22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + xxlor 46, 10, 10 + xxlor 51, 11, 11 + xxlor 56, 12, 12 + xxlor 61, 13, 13 + xxmrgld 42, 46, 45 + xxmrghd 43, 46, 45 + xxmrgld 44, 51, 50 + xxmrghd 45, 51, 50 + xxmrgld 46, 56, 55 + xxmrghd 47, 56, 55 + xxmrgld 48, 61, 60 + xxmrghd 49, 61, 60 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + nop + ori 2, 2, 0 + li 7, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + 
vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 
17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 
18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 
5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 7, 32 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 
+ xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, -64 + addi 22, 22, -64 + li 9, 16 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + 
vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + 
vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, -64 + addi 22, 22, -64 + li 9, 272 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 
56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + li 8, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 8, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 44, 0, 22 + lxvd2x 52, 8, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 20, 3 + vmhraddshs 24, 30, 6, 3 + vmhraddshs 29, 31, 11, 3 + vmladduhm 13, 25, 7, 3 + vmladduhm 18, 26, 8, 3 + vmladduhm 23, 30, 9, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + nop + nop + ori 2, 2, 0 + li 7, 64 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + 
addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + 
lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 
+ lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 
+ lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + addi 22, 22, 16 + nop + nop + ori 2, 2, 0 + li 7, 128 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 
+ lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 
21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 
30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 320 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 
+ vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + addi 14, 14, 16 + addi 22, 22, 16 + nop + nop + ori 2, 2, 0 + li 7, 256 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 
+ vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 
+ xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + 
vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 9, 192 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 40, 3, 10 + lxvd2x 44, 3, 17 + lxvd2x 48, 3, 19 + lxvd2x 52, 3, 21 + lxvd2x 53, 3, 9 + lxvd2x 54, 3, 16 + lxvd2x 55, 3, 18 + lxvd2x 56, 3, 20 + vsubuhm 25, 8, 21 + vsubuhm 26, 12, 22 + vsubuhm 30, 16, 23 + vsubuhm 31, 20, 24 + vadduhm 8, 8, 21 + vadduhm 12, 12, 22 + vadduhm 16, 16, 23 + vadduhm 20, 20, 24 + vxor 7, 7, 7 + xxlor 35, 6, 6 + xxlor 33, 7, 7 + xxlor 34, 8, 8 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 
16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 36, 3, 9 + stxvd2x 41, 3, 16 + stxvd2x 45, 3, 18 + stxvd2x 49, 3, 20 + xxlor 37, 0, 0 + xxlor 35, 3, 3 + lvx 10, 0, 14 + lvx 12, 0, 22 + vmhraddshs 14, 25, 12, 3 + vmhraddshs 19, 26, 12, 3 + vmhraddshs 24, 30, 12, 3 + vmhraddshs 29, 31, 12, 3 + vmladduhm 13, 25, 10, 3 + vmladduhm 18, 26, 10, 3 + vmladduhm 23, 30, 10, 3 + vmladduhm 28, 31, 10, 3 + vmladduhm 13, 14, 5, 13 + vmladduhm 18, 19, 5, 18 + vmladduhm 23, 24, 5, 23 + vmladduhm 28, 29, 5, 28 + stxvd2x 45, 3, 10 + stxvd2x 50, 3, 17 + stxvd2x 55, 3, 19 + stxvd2x 60, 3, 21 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvd2x 52, 10, 1 + lxvd2x 53, 11, 1 + lxvd2x 54, 12, 1 + lxvd2x 55, 14, 1 + lxvd2x 56, 15, 1 + lxvd2x 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvd2x 58, 10, 1 + lxvd2x 59, 11, 1 + lxvd2x 60, 12, 1 + lxvd2x 61, 14, 1 + lxvd2x 62, 15, 1 + lxvd2x 63, 16, 1 + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) + mtlr 0 + addi 1, 1, 352 + blr + +MLK_ASM_FN_SIZE(intt_ppc_asm) + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ 
+ +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S new file mode 100644 index 0000000000..6a99943b86 --- /dev/null +++ b/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S @@ -0,0 +1,1651 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/ntt_ppc_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLK_ASM_NAMESPACE(ntt_ppc_asm) +MLK_ASM_FN_SYMBOL(ntt_ppc_asm) + /* ntt_ppc_asm: fully unrolled, in-place transform of the 512 bytes of 16-bit elements at r3, using constants read through r4. Seven butterfly passes pair elements 256, 128, 64, 32, 16, 8 and 4 bytes apart; each butterfly multiplies the upper partner by streamed constants (vmhraddshs/vmladduhm/vmladduhm) and writes back lower plus product and lower minus product. r14-r22 and v20-v31 are saved in the prologue and restored in the epilogue. */ + stdu 1, -352(1) /* allocate a 352-byte stack frame */ + mflr 0 /* r0 = LR; written back by mtlr 0 in the epilogue */ + std 14, 56(1) + std 15, 64(1) + std 16, 72(1) + std 17, 80(1) + std 18, 88(1) + std 19, 96(1) + std 20, 104(1) + std 21, 112(1) + std 22, 120(1) /* r14-r22 now saved at 56..120(r1) */ + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + stxvd2x 52, 10, 1 + stxvd2x 53, 11, 1 + stxvd2x 54, 12, 1 + stxvd2x 55, 14, 1 + stxvd2x 56, 15, 1 + stxvd2x 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + stxvd2x 58, 10, 1 + stxvd2x 59, 11, 1 + stxvd2x 60, 12, 1 + stxvd2x 61, 14, 1 + stxvd2x 62, 15, 1 + stxvd2x 63, 16, 1 /* VSR52-VSR63 (v20-v31) now saved at 128..304(r1) */ + lvx 5, 0, 4 /* v5 = 16 bytes at r4; multiplier in the last vmladduhm of every product */ + addi 14, 4, 112 /* r14 = r4 offset by 112: stream feeding the vmladduhm low products */ + addi 22, 4, 2128 /* r22 = r4 offset by 2128: stream feeding vmhraddshs */ + vxor 3, 3, 3 /* v3 = 0, the addend operand of the multiply-adds */ + ori 2, 2, 0 + li 7, 256 /* pass 1: partners are 256 bytes apart */ + lvx 10, 0, 14 + lvx 2, 0, 22 /* v10 and v2 = constant pair for this pass */ + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 0 /* r9 = byte offset of the lower partners */ + add 10, 7, 9 /* r10 = byte offset of the upper partners */ + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 /* VSR45 is v13: first upper vector */ + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 /* v15 = rounded high halves of v13 times v2 */ + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 /* v13 = low halves of v13 times v10 */ + vmladduhm 18, 18, 10,
3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 /* v13 = low halves of v15 times v5, added to v13: the product term */ + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 /* VSR44 is v12: first lower vector */ + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 /* v16 = lower minus product */ + vadduhm 15, 13, 12 /* v15 = lower plus product */ + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 /* sum stored over the lower vector */ + stxvd2x 48, 3, 10 /* difference stored over the upper vector */ + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 64 /* next group of four vectors */ + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30,
5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 192 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 7, 128 /* pass 2: partners are 128 bytes apart */ + lvx 10, 0, 14 + lvx 2, 0, 22 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 +
lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 64 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + lvx 2, 0, 22 /* next constant pair, used for the groups at offsets 256 and 320 */ + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 +
vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 320 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 7, 64 /* pass 3: partners are 64 bytes apart; a new constant pair is loaded before each group */ + lvx 10, 0, 14 + lvx 2, 0, 22 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm
26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + lvx 2, 0, 22 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + lvx 2, 0, 22 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 +
vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + lvx 10, 0, 14 + lvx 2, 0, 22 + addi 14, 14, 16 + addi 22, 22, 16 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 16 + addi 17, 10, 16 + addi 18, 16, 16 + addi 19, 17, 16 + addi 20, 18, 16 + addi 21, 19, 16 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 2, 3 + vmhraddshs 20, 18, 2, 3 + vmhraddshs 25, 23, 2, 3 + vmhraddshs 30, 28, 2, 3 + vmladduhm 13, 13, 10, 3 + vmladduhm 18, 18, 10, 3 + vmladduhm 23, 23, 10, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + nop + nop + ori 2, 2, 0 + li 7, 32 /* pass 4: partners are 32 bytes apart; from here four constant pairs are loaded at a time (v7-v10 from r14, v0/v1/v6/v11 from r22) */ + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 +
lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 16 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + li 9, 256 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 +
vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 9, 272 + add 10, 7, 9 + addi 16, 9, 64 + addi 17, 10, 64 + addi 18, 16, 64 + addi 19, 17, 64 + addi 20, 18, 64 + addi 21, 19, 64 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + ori 2, 2, 0 + li 7, 16 /* pass 5: partners are 16 bytes apart */ + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + li 9, 0 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 +
vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + li 9, 128 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + li 9, 256 + add 10, 7, 9 + addi 16,
9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 + stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + li 9, 384 + add 10, 7, 9 + addi 16, 9, 32 + addi 17, 10, 32 + addi 18, 16, 32 + addi 19, 17, 32 + addi 20, 18, 32 + addi 21, 19, 32 + lxvd2x 45, 3, 10 + lxvd2x 50, 3, 17 + lxvd2x 55, 3, 19 + lxvd2x 60, 3, 21 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + lxvd2x 44, 3, 9 + lxvd2x 49, 3, 16 + lxvd2x 54, 3, 18 + lxvd2x 59, 3, 20 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + stxvd2x 47, 3, 9 + stxvd2x 48, 3, 10 + stxvd2x 52, 3, 16 + stxvd2x 53, 3, 17 + stxvd2x 57, 3, 18 + stxvd2x 58, 3, 19 +
stxvd2x 62, 3, 20 + stxvd2x 63, 3, 21 + mr 5, 3 /* pass 6: r5 = buffer base, advanced by 128 after each group */ + li 7, 8 + li 10, 16 + li 11, 32 + li 12, 48 + li 15, 64 + li 16, 80 + li 17, 96 + li 18, 112 + nop + ori 2, 2, 0 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 /* doubleword merges split two adjacent vectors into the v13/v12 partner registers */ + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 /* re-interleave sums and differences before storing */ + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3
+ xxmrghd 59, 4, 3 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x
0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 1, 0, 5 + lxvd2x 2, 10, 5 + xxmrgld 45, 2, 1 + xxmrghd 44, 2, 1 + lxvd2x 3, 11, 5 + lxvd2x 4, 12, 5 + xxmrgld 50, 4, 3 + xxmrghd 49, 4, 3 + lxvd2x 1, 15, 5 + lxvd2x 2, 16, 5 + xxmrgld 55, 2, 1 + xxmrghd 54, 2, 1 + lxvd2x 3, 17, 5 + lxvd2x 4, 18, 5 + xxmrgld 60, 4, 3 + xxmrghd 59, 4, 3 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + xxmrgld 0, 47, 48 + xxmrghd 1, 47, 48 + xxmrgld 2, 52, 53 + xxmrghd 3, 52, 53 + xxmrgld 4, 57, 58 + xxmrghd 5, 57, 58 + xxmrgld 6, 62, 63 + xxmrghd 7, 62, 63 + stxvd2x 0, 0, 5 + stxvd2x 1, 10, 5 + stxvd2x 2, 11, 5 + stxvd2x 3, 12, 5 + stxvd2x 4, 15, 5 + stxvd2x 5, 16, 5 + stxvd2x 6, 17, 5 + stxvd2x 7, 18, 5 + addi 5, 5, 128 + mr 5, 3 /* pass 7: r5 restarts at the buffer base */ + li 7, 4 + nop + ori 2, 2, 0 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 /* even/odd word merges split two adjacent vectors into the v13/v12 partner registers */ + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 +
lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew
14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57, 15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 16 + li 11, 32 + li 12, 48 + lxvd2x 39, 0, 14 + lxvd2x 40, 10, 14 + lxvd2x 41, 11, 14 + lxvd2x 42, 12, 14 + lxvd2x 32, 0, 22 + lxvd2x 33, 10, 22 + lxvd2x 38, 11, 22 + lxvd2x 43, 12, 22 + addi 14, 14, 64 + addi 22, 22, 64 + lxvd2x 57, 0, 5 + lxvd2x 58, 10, 5 + vmrgew 13, 25, 26 + vmrgow 12, 25, 26 + lxvd2x 57, 11, 5 + lxvd2x 58, 12, 5 + vmrgew 18, 25, 26 + vmrgow 17, 25, 26 + lxvd2x 57,
15, 5 + lxvd2x 58, 16, 5 + vmrgew 23, 25, 26 + vmrgow 22, 25, 26 + lxvd2x 57, 17, 5 + lxvd2x 58, 18, 5 + vmrgew 28, 25, 26 + vmrgow 27, 25, 26 + vmhraddshs 15, 13, 0, 3 + vmhraddshs 20, 18, 1, 3 + vmhraddshs 25, 23, 6, 3 + vmhraddshs 30, 28, 11, 3 + vmladduhm 13, 13, 7, 3 + vmladduhm 18, 18, 8, 3 + vmladduhm 23, 23, 9, 3 + vmladduhm 28, 28, 10, 3 + vmladduhm 13, 15, 5, 13 + vmladduhm 18, 20, 5, 18 + vmladduhm 23, 25, 5, 23 + vmladduhm 28, 30, 5, 28 + vsubuhm 16, 12, 13 + vadduhm 15, 13, 12 + vsubuhm 21, 17, 18 + vadduhm 20, 18, 17 + vsubuhm 26, 22, 23 + vadduhm 25, 23, 22 + vsubuhm 31, 27, 28 + vadduhm 30, 28, 27 + vmrgew 10, 16, 15 + vmrgow 11, 16, 15 + vmrgew 12, 21, 20 + vmrgow 13, 21, 20 + vmrgew 14, 26, 25 + vmrgow 15, 26, 25 + vmrgew 16, 31, 30 + vmrgow 17, 31, 30 + stxvd2x 42, 0, 5 + stxvd2x 43, 10, 5 + stxvd2x 44, 11, 5 + stxvd2x 45, 12, 5 + stxvd2x 46, 15, 5 + stxvd2x 47, 16, 5 + stxvd2x 48, 17, 5 + stxvd2x 49, 18, 5 + addi 5, 5, 128 + li 10, 128 + li 11, 144 + li 12, 160 + li 14, 176 + li 15, 192 + li 16, 208 + lxvd2x 52, 10, 1 + lxvd2x 53, 11, 1 + lxvd2x 54, 12, 1 + lxvd2x 55, 14, 1 + lxvd2x 56, 15, 1 + lxvd2x 57, 16, 1 + li 10, 224 + li 11, 240 + li 12, 256 + li 14, 272 + li 15, 288 + li 16, 304 + lxvd2x 58, 10, 1 + lxvd2x 59, 11, 1 + lxvd2x 60, 12, 1 + lxvd2x 61, 14, 1 + lxvd2x 62, 15, 1 + lxvd2x 63, 16, 1 /* VSR52-VSR63 (v20-v31) restored from 128..304(r1) */ + ld 14, 56(1) + ld 15, 64(1) + ld 16, 72(1) + ld 17, 80(1) + ld 18, 88(1) + ld 19, 96(1) + ld 20, 104(1) + ld 21, 112(1) + ld 22, 120(1) /* r14-r22 restored */ + mtlr 0 /* LR written back from r0 */ + addi 1, 1, 352 /* release the 352-byte frame */ + blr + +MLK_ASM_FN_SIZE(ntt_ppc_asm) + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S new file mode 100644 index 0000000000..170012c3bf --- /dev/null +++ b/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S @@ -0,0 +1,359 @@ +/* + * Copyright
(c) The mlkem-native project authors + * Copyright (c) IBM Corp. 2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ + +/* + * Poly_tomont: Inplace conversion of all coefficients of a polynomial + * from normal domain to Montgomery domain + * + * Arguments:*r: pointer to input/output polynomial + */ + +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/poly_tomont_ppc_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLK_ASM_NAMESPACE(poly_tomont_ppc_asm) +MLK_ASM_FN_SYMBOL(poly_tomont_ppc_asm) + + stdu 1, -320(1) + mflr 0 + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + stxvd2x 52, 6, 1 + stxvd2x 53, 7, 1 + stxvd2x 54, 8, 1 + stxvd2x 55, 9, 1 + stxvd2x 56, 10, 1 + stxvd2x 57, 11, 1 + stxvd2x 58, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + stxvd2x 59, 6, 1 + stxvd2x 60, 7, 1 + stxvd2x 61, 8, 1 + stxvd2x 62, 9, 1 + li 6, 0 + li 7, 16 + li 8, 96 + lxvd2x 37, 6, 4 + lxvd2x 34, 7, 4 + lxvd2x 32, 8, 4 + vxor 3, 3, 3 + vspltish 4, 1 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + 
lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 
+ stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + 
vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 27, 15, 4 + vsrah 28, 20, 4 + vsrah 29, 25, 4 + vsrah 30, 9, 4 + lxvd2x 45, 0, 3 + addi 3, 3, 16 + lxvd2x 50, 0, 3 + addi 3, 3, 16 + lxvd2x 55, 0, 3 + addi 3, 3, 16 + lxvd2x 39, 0, 3 + addi 3, 3, 16 + vmladduhm 15, 13, 0, 3 + vmladduhm 20, 18, 0, 3 + vmladduhm 25, 23, 0, 3 + vmladduhm 9, 7, 0, 3 + vmhraddshs 14, 13, 0, 3 + vmhraddshs 19, 18, 0, 3 + vmhraddshs 24, 23, 0, 3 + vmhraddshs 8, 7, 0, 3 + vmladduhm 15, 15, 2, 3 + vmladduhm 20, 20, 2, 3 + vmladduhm 25, 25, 2, 3 + vmladduhm 9, 9, 2, 3 + vmhraddshs 15, 15, 5, 14 + vmhraddshs 20, 20, 5, 19 + vmhraddshs 25, 25, 5, 24 + vmhraddshs 9, 9, 5, 8 + vsrah 13, 15, 4 + vsrah 18, 20, 4 + vsrah 23, 25, 4 + vsrah 7, 9, 4 + stxvd2x 59, 4, 3 + stxvd2x 60, 5, 3 + stxvd2x 61, 6, 3 + stxvd2x 62, 7, 3 + stxvd2x 45, 8, 3 + stxvd2x 50, 9, 3 + stxvd2x 55, 10, 3 + stxvd2x 39, 11, 3 + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + li 11, 208 + li 12, 224 + lxvd2x 52, 6, 1 + lxvd2x 53, 7, 1 + lxvd2x 54, 8, 1 + lxvd2x 55, 9, 1 + lxvd2x 56, 10, 1 + lxvd2x 57, 11, 1 + lxvd2x 58, 12, 1 + li 6, 240 + li 7, 256 + li 8, 272 + li 9, 288 + lxvd2x 59, 6, 1 + lxvd2x 60, 7, 1 + lxvd2x 61, 8, 1 + lxvd2x 62, 9, 1 + mtlr 0 + addi 1, 1, 320 + blr + +MLK_ASM_FN_SIZE(poly_tomont_ppc_asm) + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S new file mode 100644 index 0000000000..25814d05bb --- /dev/null +++ b/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S @@ -0,0 +1,710 @@ +/* + * Copyright (c) The mlkem-native project authors + * Copyright (c) IBM Corp. 
2025, 2026 + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + * + * Written by Danny Tsen + */ +#include "../../../common.h" +#if defined(MLK_ARITH_BACKEND_PPC64LE_DEFAULT) && \ + !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(__POWER8_VECTOR__) + +/* + * WARNING: This file is auto-derived from the mlkem-native source file + * dev/ppc64le/src/reduce_ppc_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLK_ASM_NAMESPACE(reduce_ppc_asm) +MLK_ASM_FN_SYMBOL(reduce_ppc_asm) + + stdu 1, -224(1) + mflr 0 + std 14, 96(1) + std 15, 104(1) + std 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + stxvd2x 52, 6, 1 + stxvd2x 53, 7, 1 + stxvd2x 54, 8, 1 + stxvd2x 55, 9, 1 + stxvd2x 56, 10, 1 + vxor 7, 7, 7 + li 6, 32 + li 7, 48 + lxvd2x 35, 6, 4 + lxvd2x 32, 7, 4 + vspltisw 2, 13 + vadduwm 2, 2, 2 + vspltisw 4, 1 + vsubuwm 5, 2, 4 + vslw 1, 4, 5 + li 4, -128 + li 5, -112 + li 6, -96 + li 7, -80 + li 8, -64 + li 9, -48 + li 10, -32 + li 11, -16 + li 14, 16 + li 15, 32 + li 16, 48 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + 
vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 
17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 
+ vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 
18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 21, 4, 3, 8 + vmladduhm 22, 9, 3, 12 + vmladduhm 23, 13, 3, 16 + vmladduhm 24, 17, 3, 20 + lxvd2x 40, 0, 3 + lxvd2x 44, 14, 3 + lxvd2x 48, 15, 3 + lxvd2x 52, 16, 3 + addi 3, 3, 64 + vmulosh 6, 8, 0 + vmulesh 5, 8, 0 + vmulosh 11, 12, 0 + vmulesh 10, 12, 0 + vmulosh 15, 16, 0 + vmulesh 14, 16, 0 + vmulosh 19, 20, 0 + vmulesh 18, 20, 0 + xxmrglw 36, 37, 38 + xxmrghw 37, 37, 38 + xxmrglw 41, 42, 43 + xxmrghw 42, 42, 43 + xxmrglw 45, 46, 47 + xxmrghw 46, 46, 47 + xxmrglw 49, 50, 51 + xxmrghw 50, 50, 51 + vadduwm 4, 4, 1 + vadduwm 5, 5, 1 + vadduwm 9, 9, 1 + vadduwm 10, 10, 1 + vadduwm 13, 13, 1 + vadduwm 14, 14, 1 + vadduwm 17, 17, 1 + vadduwm 18, 18, 1 + vsraw 4, 4, 2 + vsraw 5, 5, 2 + vsraw 9, 9, 2 + vsraw 10, 10, 2 + vsraw 13, 13, 2 + vsraw 14, 14, 2 + vsraw 17, 17, 2 + vsraw 18, 18, 2 + vpkuwum 4, 5, 4 + vsubuhm 4, 7, 4 + vpkuwum 9, 10, 9 + vsubuhm 9, 7, 9 + vpkuwum 13, 14, 13 + vsubuhm 13, 7, 13 + vpkuwum 17, 18, 17 + vsubuhm 17, 7, 17 + vmladduhm 4, 4, 3, 8 + vmladduhm 9, 9, 3, 12 + vmladduhm 13, 13, 3, 16 + vmladduhm 17, 17, 3, 20 + stxvd2x 53, 4, 3 + stxvd2x 54, 5, 3 + stxvd2x 55, 6, 3 + stxvd2x 56, 7, 3 + stxvd2x 36, 8, 3 + stxvd2x 41, 9, 3 + stxvd2x 45, 10, 3 + stxvd2x 49, 11, 3 + addi 3, 3, -512 + vxor 9, 9, 9 + vspltish 10, 15 + vmr 11, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 
+ addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 
14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + lxvd2x 44, 0, 3 + lxvd2x 45, 14, 3 + lxvd2x 46, 15, 3 + lxvd2x 47, 16, 3 + addi 3, 3, 64 + vsrh 1, 12, 10 + vsrh 0, 13, 10 + vsrh 3, 14, 10 + vsrh 2, 15, 10 + vadduhm 7, 12, 11 + vadduhm 8, 13, 11 + vadduhm 5, 14, 11 + vadduhm 6, 15, 11 + vcmpequh 1, 1, 9 + vcmpequh 0, 0, 9 + vcmpequh 3, 3, 9 + vcmpequh 2, 2, 9 + xxsel 33, 39, 44, 33 + xxsel 32, 40, 45, 32 + xxsel 35, 37, 46, 35 + xxsel 34, 38, 47, 34 + stxvd2x 35, 10, 3 + stxvd2x 34, 11, 3 + stxvd2x 33, 8, 3 + stxvd2x 32, 9, 3 + ld 14, 96(1) + ld 15, 104(1) + ld 16, 112(1) + li 6, 128 + li 7, 144 + li 8, 160 + li 9, 176 + li 10, 192 + lxvd2x 52, 6, 1 + lxvd2x 53, 7, 1 + lxvd2x 54, 8, 1 + lxvd2x 55, 9, 1 + lxvd2x 56, 10, 1 + mtlr 0 + addi 1, 1, 224 + blr + +MLK_ASM_FN_SIZE(reduce_ppc_asm) + +#endif /* MLK_ARITH_BACKEND_PPC64LE_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \ + && __POWER8_VECTOR__ */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/scripts/autogen b/scripts/autogen index ba7801fb52..d87b470eca 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -497,7 +497,7 @@ class CondParser: def parse_condition(self, exp, simplify=True): try: - exp = 
self.parser.parseString(exp, parseAll=True).as_list()[0] + exp = self.parser.parse_string(exp, parse_all=True).as_list()[0] except pp.ParseException: print(f"WARNING: Ignoring condition '{exp}' I cannot parse") return exp @@ -2083,6 +2083,150 @@ def gen_riscv64_zeta_files(): ) +# The PPC64LE backend stores its twiddle factors in two include files, +# `consts_ntt.inc` and `consts_intt.inc`, which are concatenated into the +# `mlk_ppc_qdata` table in `consts.c`. The values are the canonical Montgomery +# zetas (matching `mlk_zetas` in `mlkem/src/zetas.inc`), arranged per layer to +# match the order in which the assembly consumes them. +# +# NTT layout (in order): +# - Layers Len=128, 64, 32, 16, 8: 31 zetas, each broadcast 8 times, +# in canonical order (`mlk_zetas[1..31]`). +# - Layer Len=4: 32 zetas, each broadcast 4 times, with adjacent pairs +# swapped (`mlk_zetas[32 + (i^1)]`). +# - Layer Len=2: 64 zetas (`mlk_zetas[64..127]`), grouped in 16 quadruples; +# each quadruple is permuted by (1,2,3,4) -> (3,1,4,2) and each entry +# duplicated to fill an 8-halfword vector. +# +# Inverse NTT layout (in order): +# - Layer Len=2: same permutation/duplication as the NTT, but applied to +# `mlk_zetas[127..64]` grouped in quadruples of decreasing index. +# - Layer Len=4: 32 zetas, each broadcast 4 times, in reverse canonical +# order with adjacent pairs swapped (`mlk_zetas[63 - (i^1)]`). +# - Layers Len=8, 16, 32, 64, 128: 31 zetas, each broadcast 8 times, +# in reverse canonical order (`mlk_zetas[31..1]`). +_PPC64LE_LEN2_PERM = [2, 0, 3, 1] + + +def gen_c_real_zetas(): + # Same bit-reversed ordering as `gen_c_zetas`, but without the + # Montgomery factor (raw `z = root^idx mod q`). + zeta = [signed_reduce(pow(root_of_unity, i, modulus)) for i in range(128)] + yield from (zeta[bitreverse(i, 7)] for i in range(128)) + + +def _gen_ppc64le_ntt_zeta_layout(transform): + z = [transform(t) for t in gen_c_real_zetas()] + + # Layers Len=128, 64, 32, 16, 8: broadcast x 8. 
+ for i in range(1, 32): + yield from [z[i]] * 8 + + # Layer Len=4: broadcast x 4, adjacent pairs swapped. + for i in range(32): + yield from [z[32 + (i ^ 1)]] * 4 + + # Layer Len=2: per group of 4 zetas, permute (3,1,4,2) and duplicate each. + for g in range(16): + src = z[64 + g * 4 : 64 + (g + 1) * 4] + for p in _PPC64LE_LEN2_PERM: + yield from [src[p]] * 2 + + +def _gen_ppc64le_intt_zeta_layout(transform): + z = [transform(t) for t in gen_c_real_zetas()] + + # Layer Len=2: per group of 4 reverse-ordered zetas, same permutation + # and duplication as the NTT. + for g in range(16): + src = [z[127 - g * 4 - i] for i in range(4)] + for p in _PPC64LE_LEN2_PERM: + yield from [src[p]] * 2 + + # Layer Len=4: broadcast x 4, reverse canonical order, adjacent pairs + # swapped. + for i in range(32): + yield from [z[63 - (i ^ 1)]] * 4 + + # Layers Len=8, 16, 32, 64, 128: broadcast x 8, reverse canonical order. + for i in range(31): + yield from [z[31 - i]] * 8 + + +def _ppc64le_zeta_id(z): + return signed_reduce(z) + + +def _ppc64le_zeta_twist(z): + return prepare_root_for_barrett(z)[1] + + +def gen_ppc64le_ntt_zetas(): + yield from _gen_ppc64le_ntt_zeta_layout(_ppc64le_zeta_id) + + +def gen_ppc64le_intt_zetas(): + yield from _gen_ppc64le_intt_zeta_layout(_ppc64le_zeta_id) + + +def gen_ppc64le_ntt_twist_zetas(): + yield from _gen_ppc64le_ntt_zeta_layout(_ppc64le_zeta_twist) + + +def gen_ppc64le_intt_twist_zetas(): + yield from _gen_ppc64le_intt_zeta_layout(_ppc64le_zeta_twist) + + +def gen_ppc64le_zeta_files(): + """Generate PPC64LE zeta include files.""" + + def gen_inc(zetas, leading_comment, entries_per_line=8): + yield from gen_header() + yield leading_comment + zetas = list(zetas) + for i in range(0, len(zetas), entries_per_line): + chunk = zetas[i : i + entries_per_line] + yield " " + ", ".join(str(t) for t in chunk) + "," + yield "" + + ntt_content = "\n".join( + gen_inc( + gen_ppc64le_ntt_zetas(), + "/* Twiddle factors for the PPC64LE forward NTT.\n * See autogen 
for details.\n */", + ) + ) + intt_content = "\n".join( + gen_inc( + gen_ppc64le_intt_zetas(), + "/* Twiddle factors for the PPC64LE inverse NTT.\n * See autogen for details.\n */", + ) + ) + ntt_tw_content = "\n".join( + gen_inc( + gen_ppc64le_ntt_twist_zetas(), + "/* Twisted twiddle factors for the PPC64LE forward NTT.\n * See autogen for details.\n */", + ) + ) + intt_tw_content = "\n".join( + gen_inc( + gen_ppc64le_intt_twist_zetas(), + "/* Twisted twiddle factors for the PPC64LE inverse NTT.\n * See autogen for details.\n */", + ) + ) + + # The .inc files are #include'd by `consts.c` (not by an .S file), so they + # are not inlined via simpasm; we therefore write them directly into both + # the developer tree and the mlkem mirror. + for filename, content in ( + ("consts_ntt.inc", ntt_content), + ("consts_intt.inc", intt_content), + ("consts_ntt_tw.inc", ntt_tw_content), + ("consts_intt_tw.inc", intt_tw_content), + ): + for tree in ("dev/ppc64le/src", "mlkem/src/native/ppc64le/src"): + update_file(f"{tree}/{filename}", content) + + def get_c_source_files(main_only=False, core_only=False, strip_mlkem=False): if main_only is True: return get_files("mlkem/src/**/*.c", strip_mlkem=strip_mlkem) @@ -2241,6 +2385,10 @@ def riscv64(c): return "/riscv64/" in c +def ppc64le(c): + return "/ppc64le/" in c + + def armv81m(c): return "/armv81m/" in c @@ -2286,12 +2434,17 @@ def native_arith_riscv64(c): return native_arith(c) and riscv64(c) +def native_arith_ppc64le(c): + return native_arith(c) and ppc64le(c) + + def native_arith_core(c): return ( native_arith(c) and not native_arith_x86_64(c) and not native_arith_aarch64(c) and not native_arith_riscv64(c) + and not native_arith_ppc64le(c) ) @@ -2400,6 +2553,11 @@ def gen_macro_undefs(extra_notes=None): filt=native_arith_riscv64, desc="native code (Arith, RISC-V 64)" ) yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + yield from gen_monolithic_undef_all_core( + filt=native_arith_ppc64le, desc="native code (Arith, 
PPC64LE)" + ) + yield "#endif" yield "#endif" yield "#endif" yield "" @@ -2481,6 +2639,10 @@ def gen_monolithic_source_file(): for c in filter(native_arith_riscv64, c_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, c_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2568,6 +2730,10 @@ def gen_monolithic_asm_file(): for c in filter(native_arith_riscv64, asm_sources): yield f'#include "{c}"' yield "#endif" + yield "#if defined(MLK_SYS_PPC64LE)" + for c in filter(native_arith_ppc64le, asm_sources): + yield f'#include "{c}"' + yield "#endif" yield "#endif" yield "" yield "#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)" @@ -2893,6 +3059,8 @@ def update_via_simpasm( source_arch = "x86_64" elif "armv81m" in infile_full: source_arch = "armv81m" + elif "ppc64le" in infile_full: + source_arch = "ppc64le" else: raise Exception(f"Could not detect architecture of source file {infile_full}.") # Check native architecture @@ -2910,13 +3078,21 @@ def update_via_simpasm( return raise Exception(f"Could not find cross toolchain {cross_prefix}") elif native_arch != source_arch: - cross_prefix = f"{source_arch}-unknown-linux-gnu-" + # PPC64LE uses "powerpc64le" in the GNU triple, not the bare "ppc64le". + arch_triple = "powerpc64le" if source_arch == "ppc64le" else source_arch + cross_prefix = f"{arch_triple}-unknown-linux-gnu-" cross_gcc = cross_prefix + "gcc" # Check if cross-compiler is present if shutil.which(cross_gcc) is None: - if force_cross is False: + if "--target=" in (cflags or ""): + # No cross-gcc, but an explicit target triple is already in + # cflags (e.g. via --cflags on macOS). Let simpasm + # use its default cc/nm (e.g. clang + llvm-nm on Darwin). 
+ cross_prefix = None + elif force_cross is False: return - raise Exception(f"Could not find cross toolchain {cross_prefix}") + else: + raise Exception(f"Could not find cross toolchain {cross_prefix}") else: cross_prefix = None @@ -2927,6 +3103,8 @@ def update_via_simpasm( arch = "aarch64" elif "armv81m" in infile_full: arch = "armv81m" + elif "ppc64le" in infile_full: + arch = "ppc64le" else: arch = "x86_64" @@ -2939,7 +3117,9 @@ def update_via_simpasm( "-o", tmp.name, ] - cmd += ["--cfify"] + # TODO: Support CFI for ppc64le + if arch != "ppc64le": + cmd += ["--cfify"] if cross_prefix is not None: # Stick with llvm-objdump for disassembly cmd += ["--cc", cross_prefix + "gcc"] @@ -3295,6 +3475,7 @@ def synchronize_backends( delete=False, no_simplify=False, x86_64_syntax="att", + extra_cflags=None, ): if clean is False: ty = "opt" @@ -3400,6 +3581,22 @@ def synchronize_backends( x86_64_syntax=x86_64_syntax, cflags="-Idev/fips202/x86_64 -Imlkem/src/fips202/native/x86_64 -mavx2 -mbmi2 -msse4 -fcf-protection=none", ) + synchronize_backend( + "dev/ppc64le/src", + "mlkem/src/native/ppc64le/src", + delete=delete, + force_cross=force_cross, + no_simplify=no_simplify, + cflags=" ".join( + filter( + None, + [ + extra_cflags, + "-Idev/ppc64le/src -Imlkem/src/native/ppc64le/src -mcpu=power8", + ], + ) + ), + ) def adjust_header_guard_for_filename(content, header_file): @@ -4215,6 +4412,13 @@ def _main(): default="att", help="Assembly syntax for x86_64 disassembly output (att or intel)", ) + parser.add_argument( + "--cflags", + type=str, + default=None, + metavar="FLAGS", + help="Extra CFLAGS prepended to ppc64le simpasm invocations", + ) args = parser.parse_args() @@ -4229,6 +4433,7 @@ def _main(): no_simplify=args.no_simplify, force_cross=args.force_cross, x86_64_syntax=args.x86_64_syntax, + extra_cflags=args.cflags, ) def sync_backends_final(): @@ -4238,6 +4443,7 @@ def _main(): force_cross=args.force_cross, no_simplify=args.no_simplify, x86_64_syntax=args.x86_64_syntax, + 
extra_cflags=args.cflags, ) # Build step list: (description, function, enabled) @@ -4269,6 +4475,7 @@ def _main(): gen_avx2_keccak_constants_c_file() gen_avx2_keccak_hol_light_constants_file() gen_riscv64_zeta_files() + gen_ppc64le_zeta_files() def gen_monolithic(): gen_monolithic_source_file() diff --git a/scripts/simpasm b/scripts/simpasm index 60305c934b..b93565adb1 100755 --- a/scripts/simpasm +++ b/scripts/simpasm @@ -368,8 +368,8 @@ def simplify(logger, args, asm_input, asm_output=None): logger.debug("Checking that byte-code is unchanged ...") - # When CFI is enabled or for Armv8.1-M, compare only the __text section content - if args.cfify or args.arch == "armv81m": + # When CFI is enabled or for Armv8.1-M/ppc64le, compare only the __text section content + if args.cfify or args.arch in ("armv81m", "ppc64le"): logger.debug("Comparing __text section content for CFI comparison...") # Extract __text section from both files @@ -437,7 +437,9 @@ def _main(): parser.add_argument( "--cc", type=str, default="gcc" if platform.system() != "Darwin" else "clang" ) - parser.add_argument("--nm", type=str, default="nm") + parser.add_argument( + "--nm", type=str, default="nm" if platform.system() != "Darwin" else "llvm-nm" + ) parser.add_argument("--objdump", type=str, default="objdump") parser.add_argument("--strip", type=str, default="llvm-strip") parser.add_argument("--cflags", type=str) diff --git a/test/mk/components.mk b/test/mk/components.mk index 4a3768c6c7..5cc4db8786 100644 --- a/test/mk/components.mk +++ b/test/mk/components.mk @@ -8,7 +8,7 @@ endif SOURCES += $(wildcard mlkem/src/*.c) ifeq ($(OPT),1) - SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS]) + SOURCES += $(wildcard mlkem/src/native/aarch64/src/*.[csS]) $(wildcard mlkem/src/native/x86_64/src/*.[csS]) $(wildcard mlkem/src/native/riscv64/src/*.[csS]) $(wildcard mlkem/src/native/ppc64le/src/*.[csS]) 
CFLAGS += -DMLK_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 endif